src/factor.c

   1 /* factor -- print prime factors of n.
   2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
  18    Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
  19    Arbitrary-precision code adapted by James Youngman from Torbjörn
  20    Granlund's factorize.c, from GNU MP version 4.2.2.
  21    In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
  22    Contains code from GNU MP.  */
  23
  24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
  25    or, with GMP, numbers of any size.
  26
  27   Code organisation:
  28
  29     There are several variants of many functions, for handling one word, two
  30     words, and GMP's mpz_t type.  If the one-word variant is called foo, the
  31     two-word variant will be foo2, and the one for mpz_t will be mp_foo.  In
  32     some cases, the plain function variants will handle both one-word and
  33     two-word numbers, evidenced by function arguments.
  34
  35     The factoring code for two words will fall into the code for one word when
  36     progress allows that.
  37
  38   Algorithm:
  39
  40     (1) Perform trial division using a small primes table, but without hardware
  41         division since the primes table store inverses modulo the word base.
  42         (The GMP variant of this code doesn't make use of the precomputed
  43         inverses, but instead relies on GMP for fast divisibility testing.)
  44     (2) Check the nature of any non-factored part using Miller-Rabin for
  45         detecting composites, and Lucas for detecting primes.
  46     (3) Factor any remaining composite part using the Pollard-Brent rho
  47         algorithm or if USE_SQUFOF is defined to 1, try that first.
  48         Status of found factors are checked again using Miller-Rabin and Lucas.
  49
  50     We prefer using Hensel norm in the divisions, not the more familiar
  51     Euclidian norm, since the former leads to much faster code.  In the
  52     Pollard-Brent rho code and the prime testing code, we use Montgomery's
  53     trick of multiplying all n-residues by the word base, allowing cheap Hensel
  54     reductions mod n.
  55
  56     The GMP code uses an algorithm that can be considerably slower;
  57     for example, on a circa-2017 Intel Xeon Silver 4116, factoring
  58     2^{127}-3 takes about 50 ms with the two-word algorithm but would
  59     take about 750 ms with the GMP code.
  60
  61   Improvements:
  62
  63     * Use modular inverses also for exact division in the Lucas code, and
  64       elsewhere.  A problem is to locate the inverses not from an index, but
  65       from a prime.  We might instead compute the inverse on-the-fly.
  66
  67     * Tune trial division table size (not forgetting that this is a standalone
  68       program where the table will be read from secondary storage for
  69       each invocation).
  70
  71     * Implement less naive powm, using k-ary exponentiation for k = 3 or
  72       perhaps k = 4.
  73
  74     * Try to speed trial division code for single uintmax_t numbers, i.e., the
  75       code using DIVBLOCK.  It currently runs at 2 cycles per prime (Intel SBR,
  76       IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
  77       using gcc 4.6 and 4.7.  Some software pipelining should help; 1, 2, and 4
  78       respectively cycles ought to be possible.
  79
  80     * The redcify function could be vastly improved by using (plain Euclidian)
  81       pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
  82       GMP's gmp-impl.h).  The redcify2 function could be vastly improved using
  83       similar methoods.  These functions currently dominate run time when using
  84       the -w option.
  85 */
  86
  87 /* Whether to recursively factor to prove primality,
  88    or run faster probabilistic tests.  */
  89 #ifndef PROVE_PRIMALITY
  90 # define PROVE_PRIMALITY 1
  91 #endif
  92
  93 /* Faster for certain ranges but less general.  */
  94 #ifndef USE_SQUFOF
  95 # define USE_SQUFOF 0
  96 #endif
  97
  98 /* Output SQUFOF statistics.  */
  99 #ifndef STAT_SQUFOF
 100 # define STAT_SQUFOF 0
 101 #endif
 102
 103
 104 #include <config.h>
 105 #include <getopt.h>
 106 #include <stdio.h>
 107 #include <gmp.h>
 108
 109 #include "system.h"
 110 #include "assure.h"
 111 #include "full-write.h"
 112 #include "quote.h"
 113 #include "readtokens.h"
 114 #include "xstrtol.h"
 115
 116 /* The official name of this program (e.g., no 'g' prefix).  */
 117 #define PROGRAM_NAME "factor"
 118
 119 #define AUTHORS \
 120   proper_name ("Paul Rubin"),                                           \
 121   proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"),   \
 122   proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
 123
 124 /* Token delimiters when reading from a file.  */
 125 #define DELIM "\n\t "
 126
 127 #ifndef USE_LONGLONG_H
 128 /* With the way we use longlong.h, it's only safe to use
 129    when UWtype = UHWtype, as there were various cases
 130    (as can be seen in the history for longlong.h) where
 131    for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
 132    to avoid compile time or run time issues.  */
 133 # if LONG_MAX == INTMAX_MAX
 134 #  define USE_LONGLONG_H 1
 135 # endif
 136 #endif
 137
 138 #if USE_LONGLONG_H
 139
 140 /* Make definitions for longlong.h to make it do what it can do for us */
 141
 142 /* bitcount for uintmax_t */
 143 # if UINTMAX_MAX == UINT32_MAX
 144 #  define W_TYPE_SIZE 32
 145 # elif UINTMAX_MAX == UINT64_MAX
 146 #  define W_TYPE_SIZE 64
 147 # elif UINTMAX_MAX == UINT128_MAX
 148 #  define W_TYPE_SIZE 128
 149 # endif
 150
 151 # define UWtype  uintmax_t
 152 # define UHWtype unsigned long int
 153 # undef UDWtype
 154 # if HAVE_ATTRIBUTE_MODE
 155 typedef unsigned int UQItype    __attribute__ ((mode (QI)));
 156 typedef          int SItype     __attribute__ ((mode (SI)));
 157 typedef unsigned int USItype    __attribute__ ((mode (SI)));
 158 typedef          int DItype     __attribute__ ((mode (DI)));
 159 typedef unsigned int UDItype    __attribute__ ((mode (DI)));
 160 # else
 161 typedef unsigned char UQItype;
 162 typedef          long SItype;
 163 typedef unsigned long int USItype;
 164 #  if HAVE_LONG_LONG_INT
 165 typedef long long int DItype;
 166 typedef unsigned long long int UDItype;
 167 #  else /* Assume `long' gives us a wide enough type.  Needed for hppa2.0w.  */
 168 typedef long int DItype;
 169 typedef unsigned long int UDItype;
 170 #  endif
 171 # endif
 172 # define LONGLONG_STANDALONE     /* Don't require GMP's longlong.h mdep files */
 173 # define ASSERT(x)               /* FIXME make longlong.h really standalone */
 174 # define __GMP_DECLSPEC          /* FIXME make longlong.h really standalone */
 175 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
 176 # ifndef __GMP_GNUC_PREREQ
 177 #  define __GMP_GNUC_PREREQ(a,b) 1
 178 # endif
 179
 180 /* These stub macros are only used in longlong.h in certain system compiler
 181    combinations, so ensure usage to avoid -Wunused-macros warnings.  */
 182 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
 183 ASSERT (1)
 184 __GMP_DECLSPEC
 185 # endif
 186
 187 # if _ARCH_PPC
 188 #  define HAVE_HOST_CPU_FAMILY_powerpc 1
 189 # endif
 190 # include "longlong.h"
 191 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
 192 const unsigned char factor_clz_tab[129] =
 193 {
 194   1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 195   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 196   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 197   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 198   9
 199 };
 200 # endif
 201
 202 #else /* not USE_LONGLONG_H */
 203
 204 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
 205 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
 206 # define __ll_lowpart(t)  ((uintmax_t) (t) & (__ll_B - 1))
 207 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
 208
 209 #endif
 210
 211 #if !defined __clz_tab && !defined UHWtype
 212 /* Without this seemingly useless conditional, gcc -Wunused-macros
 213    warns that each of the two tested macros is unused on Fedora 18.
 214    FIXME: this is just an ugly band-aid.  Fix it properly.  */
 215 #endif
 216
 217 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
 218 #define MAX_NFACTS 26
 219
 220 enum
 221 {
 222   DEV_DEBUG_OPTION = CHAR_MAX + 1
 223 };
 224
 225 static struct option const long_options[] =
 226 {
 227   {"exponents", no_argument, nullptr, 'h'},
 228   {"-debug", no_argument, nullptr, DEV_DEBUG_OPTION},
 229   {GETOPT_HELP_OPTION_DECL},
 230   {GETOPT_VERSION_OPTION_DECL},
 231   {nullptr, 0, nullptr, 0}
 232 };
 233
 234 /* If true, use p^e output format.  */
 235 static bool print_exponents;
 236
 237 struct factors
 238 {
 239   uintmax_t     plarge[2]; /* Can have a single large factor */
 240   uintmax_t     p[MAX_NFACTS];
 241   unsigned char e[MAX_NFACTS];
 242   unsigned char nfactors;
 243 };
 244
 245 struct mp_factors
 246 {
 247   mpz_t             *p;
 248   unsigned long int *e;
 249   idx_t nfactors;
 250 };
 251
 252 static void factor (uintmax_t, uintmax_t, struct factors *);
 253
 254 #ifndef umul_ppmm
 255 # define umul_ppmm(w1, w0, u, v)                                        \
 256   do {                                                                  \
 257     uintmax_t __x0, __x1, __x2, __x3;                                   \
 258     unsigned long int __ul, __vl, __uh, __vh;                           \
 259     uintmax_t __u = (u), __v = (v);                                     \
 260                                                                         \
 261     __ul = __ll_lowpart (__u);                                          \
 262     __uh = __ll_highpart (__u);                                         \
 263     __vl = __ll_lowpart (__v);                                          \
 264     __vh = __ll_highpart (__v);                                         \
 265                                                                         \
 266     __x0 = (uintmax_t) __ul * __vl;                                     \
 267     __x1 = (uintmax_t) __ul * __vh;                                     \
 268     __x2 = (uintmax_t) __uh * __vl;                                     \
 269     __x3 = (uintmax_t) __uh * __vh;                                     \
 270                                                                         \
 271     __x1 += __ll_highpart (__x0);/* This can't give carry.  */          \
 272     __x1 += __x2;               /* But this indeed can.  */             \
 273     if (__x1 < __x2)            /* Did we get it?  */                   \
 274       __x3 += __ll_B;           /* Yes, add it in the proper pos.  */   \
 275                                                                         \
 276     (w1) = __x3 + __ll_highpart (__x1);                                 \
 277     (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0);             \
 278   } while (0)
 279 #endif
 280
 281 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
 282 /* Define our own, not needing normalization.  This function is
 283    currently not performance critical, so keep it simple.  Similar to
 284    the mod macro below.  */
 285 # undef udiv_qrnnd
 286 # define udiv_qrnnd(q, r, n1, n0, d)                                    \
 287   do {                                                                  \
 288     uintmax_t __d1, __d0, __q, __r1, __r0;                              \
 289                                                                         \
 290     __d1 = (d); __d0 = 0;                                               \
 291     __r1 = (n1); __r0 = (n0);                                           \
 292     affirm (__r1 < __d1);                                               \
 293     __q = 0;                                                            \
 294     for (int __i = W_TYPE_SIZE; __i > 0; __i--)                         \
 295       {                                                                 \
 296         rsh2 (__d1, __d0, __d1, __d0, 1);                               \
 297         __q <<= 1;                                                      \
 298         if (ge2 (__r1, __r0, __d1, __d0))                               \
 299           {                                                             \
 300             __q++;                                                      \
 301             sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0);            \
 302           }                                                             \
 303       }                                                                 \
 304     (r) = __r0;                                                         \
 305     (q) = __q;                                                          \
 306   } while (0)
 307 #endif
 308
 309 #if !defined add_ssaaaa
 310 # define add_ssaaaa(sh, sl, ah, al, bh, bl)                             \
 311   do {                                                                  \
 312     uintmax_t _add_x;                                                   \
 313     _add_x = (al) + (bl);                                               \
 314     (sh) = (ah) + (bh) + (_add_x < (al));                               \
 315     (sl) = _add_x;                                                      \
 316   } while (0)
 317 #endif
 318
 319 #define rsh2(rh, rl, ah, al, cnt)                                       \
 320   do {                                                                  \
 321     (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt));           \
 322     (rh) = (ah) >> (cnt);                                               \
 323   } while (0)
 324
 325 #define lsh2(rh, rl, ah, al, cnt)                                       \
 326   do {                                                                  \
 327     (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt)));             \
 328     (rl) = (al) << (cnt);                                               \
 329   } while (0)
 330
 331 #define ge2(ah, al, bh, bl)                                             \
 332   ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
 333
 334 #define gt2(ah, al, bh, bl)                                             \
 335   ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
 336
 337 #ifndef sub_ddmmss
 338 # define sub_ddmmss(rh, rl, ah, al, bh, bl)                             \
 339   do {                                                                  \
 340     uintmax_t _cy;                                                      \
 341     _cy = (al) < (bl);                                                  \
 342     (rl) = (al) - (bl);                                                 \
 343     (rh) = (ah) - (bh) - _cy;                                           \
 344   } while (0)
 345 #endif
 346
 347 #ifndef count_leading_zeros
 348 # define count_leading_zeros(count, x) do {                             \
 349     uintmax_t __clz_x = (x);                                            \
 350     int __clz_c;                                                        \
 351     for (__clz_c = 0;                                                   \
 352          (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0;      \
 353          __clz_c += 8)                                                  \
 354       __clz_x <<= 8;                                                    \
 355     for (; (intmax_t)__clz_x >= 0; __clz_c++)                           \
 356       __clz_x <<= 1;                                                    \
 357     (count) = __clz_c;                                                  \
 358   } while (0)
 359 #endif
 360
 361 #ifndef count_trailing_zeros
 362 # define count_trailing_zeros(count, x) do {                            \
 363     uintmax_t __ctz_x = (x);                                            \
 364     int __ctz_c = 0;                                                    \
 365     while ((__ctz_x & 1) == 0)                                          \
 366       {                                                                 \
 367         __ctz_x >>= 1;                                                  \
 368         __ctz_c++;                                                      \
 369       }                                                                 \
 370     (count) = __ctz_c;                                                  \
 371   } while (0)
 372 #endif
 373
 374 /* Requires that a < n and b <= n */
 375 #define submod(r,a,b,n)                                                 \
 376   do {                                                                  \
 377     uintmax_t _t = - (uintmax_t) (a < b);                               \
 378     (r) = ((n) & _t) + (a) - (b);                                       \
 379   } while (0)
 380
 381 #define addmod(r,a,b,n)                                                 \
 382   submod ((r), (a), ((n) - (b)), (n))
 383
 384 /* Modular two-word addition and subtraction.  For performance reasons, the
 385    most significant bit of n1 must be clear.  The destination variables must be
 386    distinct from the mod operand.  */
 387 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 388   do {                                                                  \
 389     add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 390     if (ge2 ((r1), (r0), (n1), (n0)))                                   \
 391       sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 392   } while (0)
 393 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0)                         \
 394   do {                                                                  \
 395     sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0));                    \
 396     if ((intmax_t) (r1) < 0)                                            \
 397       add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0));                  \
 398   } while (0)
 399
 400 #define HIGHBIT_TO_MASK(x)                                              \
 401   (((intmax_t)-1 >> 1) < 0                                              \
 402    ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1))                    \
 403    : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1))                        \
 404       ? UINTMAX_MAX : (uintmax_t) 0))
 405
 406 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
 407    Requires that d1 != 0.  */
 408 static uintmax_t
 409 mod2 (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t d1, uintmax_t d0)
 410 {
 411   int cntd, cnta;
 412
 413   affirm (d1 != 0);
 414
 415   if (a1 == 0)
 416     {
 417       *r1 = 0;
 418       return a0;
 419     }
 420
 421   count_leading_zeros (cntd, d1);
 422   count_leading_zeros (cnta, a1);
 423   int cnt = cntd - cnta;
 424   lsh2 (d1, d0, d1, d0, cnt);
 425   for (int i = 0; i < cnt; i++)
 426     {
 427       if (ge2 (a1, a0, d1, d0))
 428         sub_ddmmss (a1, a0, a1, a0, d1, d0);
 429       rsh2 (d1, d0, d1, d0, 1);
 430     }
 431
 432   *r1 = a1;
 433   return a0;
 434 }
 435
 436 ATTRIBUTE_CONST
 437 static uintmax_t
 438 gcd_odd (uintmax_t a, uintmax_t b)
 439 {
 440   if ((b & 1) == 0)
 441     {
 442       uintmax_t t = b;
 443       b = a;
 444       a = t;
 445     }
 446   if (a == 0)
 447     return b;
 448
 449   /* Take out least significant one bit, to make room for sign */
 450   b >>= 1;
 451
 452   for (;;)
 453     {
 454       uintmax_t t;
 455       uintmax_t bgta;
 456
 457       while ((a & 1) == 0)
 458         a >>= 1;
 459       a >>= 1;
 460
 461       t = a - b;
 462       if (t == 0)
 463         return (a << 1) + 1;
 464
 465       bgta = HIGHBIT_TO_MASK (t);
 466
 467       /* b <-- min (a, b) */
 468       b += (bgta & t);
 469
 470       /* a <-- |a - b| */
 471       a = (t ^ bgta) - bgta;
 472     }
 473 }
 474
 475 static uintmax_t
 476 gcd2_odd (uintmax_t *r1, uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0)
 477 {
 478   affirm (b0 & 1);
 479
 480   if ((a0 | a1) == 0)
 481     {
 482       *r1 = b1;
 483       return b0;
 484     }
 485
 486   while ((a0 & 1) == 0)
 487     rsh2 (a1, a0, a1, a0, 1);
 488
 489   for (;;)
 490     {
 491       if ((b1 | a1) == 0)
 492         {
 493           *r1 = 0;
 494           return gcd_odd (b0, a0);
 495         }
 496
 497       if (gt2 (a1, a0, b1, b0))
 498         {
 499           sub_ddmmss (a1, a0, a1, a0, b1, b0);
 500           do
 501             rsh2 (a1, a0, a1, a0, 1);
 502           while ((a0 & 1) == 0);
 503         }
 504       else if (gt2 (b1, b0, a1, a0))
 505         {
 506           sub_ddmmss (b1, b0, b1, b0, a1, a0);
 507           do
 508             rsh2 (b1, b0, b1, b0, 1);
 509           while ((b0 & 1) == 0);
 510         }
 511       else
 512         break;
 513     }
 514
 515   *r1 = a1;
 516   return a0;
 517 }
 518
 519 static void
 520 factor_insert_multiplicity (struct factors *factors,
 521                             uintmax_t prime, int m)
 522 {
 523   int nfactors = factors->nfactors;
 524   uintmax_t *p = factors->p;
 525   unsigned char *e = factors->e;
 526
 527   /* Locate position for insert new or increment e.  */
 528   int i;
 529   for (i = nfactors - 1; i >= 0; i--)
 530     {
 531       if (p[i] <= prime)
 532         break;
 533     }
 534
 535   if (i < 0 || p[i] != prime)
 536     {
 537       for (int j = nfactors - 1; j > i; j--)
 538         {
 539           p[j + 1] = p[j];
 540           e[j + 1] = e[j];
 541         }
 542       p[i + 1] = prime;
 543       e[i + 1] = m;
 544       factors->nfactors = nfactors + 1;
 545     }
 546   else
 547     {
 548       e[i] += m;
 549     }
 550 }
 551
 552 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
 553
 554 static void
 555 factor_insert_large (struct factors *factors,
 556                      uintmax_t p1, uintmax_t p0)
 557 {
 558   if (p1 > 0)
 559     {
 560       affirm (factors->plarge[1] == 0);
 561       factors->plarge[0] = p0;
 562       factors->plarge[1] = p1;
 563     }
 564   else
 565     factor_insert (factors, p0);
 566 }
 567
 568 #ifndef mpz_inits
 569
 570 # include <stdarg.h>
 571
 572 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
 573 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
 574
 575 static void
 576 mpz_va_init (void (*mpz_single_init)(mpz_t), ...)
 577 {
 578   va_list ap;
 579
 580   va_start (ap, mpz_single_init);
 581
 582   mpz_t *mpz;
 583   while ((mpz = va_arg (ap, mpz_t *)))
 584     mpz_single_init (*mpz);
 585
 586   va_end (ap);
 587 }
 588 #endif
 589
 590 static void mp_factor (mpz_t, struct mp_factors *);
 591
 592 static void
 593 mp_factor_init (struct mp_factors *factors)
 594 {
 595   factors->p = nullptr;
 596   factors->e = nullptr;
 597   factors->nfactors = 0;
 598 }
 599
 600 static void
 601 mp_factor_clear (struct mp_factors *factors)
 602 {
 603   for (idx_t i = 0; i < factors->nfactors; i++)
 604     mpz_clear (factors->p[i]);
 605
 606   free (factors->p);
 607   free (factors->e);
 608 }
 609
 610 static void
 611 mp_factor_insert (struct mp_factors *factors, mpz_t prime)
 612 {
 613   idx_t nfactors = factors->nfactors;
 614   mpz_t *p = factors->p;
 615   unsigned long int *e = factors->e;
 616   ptrdiff_t i;
 617
 618   /* Locate position for insert new or increment e.  */
 619   for (i = nfactors - 1; i >= 0; i--)
 620     {
 621       if (mpz_cmp (p[i], prime) <= 0)
 622         break;
 623     }
 624
 625   if (i < 0 || mpz_cmp (p[i], prime) != 0)
 626     {
 627       p = xireallocarray (p, nfactors + 1, sizeof p[0]);
 628       e = xireallocarray (e, nfactors + 1, sizeof e[0]);
 629
 630       mpz_init (p[nfactors]);
 631       for (long j = nfactors - 1; j > i; j--)
 632         {
 633           mpz_set (p[j + 1], p[j]);
 634           e[j + 1] = e[j];
 635         }
 636       mpz_set (p[i + 1], prime);
 637       e[i + 1] = 1;
 638
 639       factors->p = p;
 640       factors->e = e;
 641       factors->nfactors = nfactors + 1;
 642     }
 643   else
 644     {
 645       e[i] += 1;
 646     }
 647 }
 648
 649 static void
 650 mp_factor_insert_ui (struct mp_factors *factors, unsigned long int prime)
 651 {
 652   mpz_t pz;
 653
 654   mpz_init_set_ui (pz, prime);
 655   mp_factor_insert (factors, pz);
 656   mpz_clear (pz);
 657 }
 658
 659
 660 /* Number of bits in an uintmax_t.  */
 661 enum { W = sizeof (uintmax_t) * CHAR_BIT };
 662
 663 /* Verify that uintmax_t does not have holes in its representation.  */
 664 static_assert (UINTMAX_MAX >> (W - 1) == 1);
 665
 666 #define P(a,b,c,d) a,
 667 static const unsigned char primes_diff[] = {
 668 #include "primes.h"
 669 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 670 };
 671 #undef P
 672
 673 #define PRIMES_PTAB_ENTRIES \
 674   (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
 675
 676 #define P(a,b,c,d) b,
 677 static const unsigned char primes_diff8[] = {
 678 #include "primes.h"
 679 0,0,0,0,0,0,0                           /* 7 sentinels for 8-way loop */
 680 };
 681 #undef P
 682
 683 struct primes_dtab
 684 {
 685   uintmax_t binv, lim;
 686 };
 687
 688 #define P(a,b,c,d) {c,d},
 689 static const struct primes_dtab primes_dtab[] = {
 690 #include "primes.h"
 691 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
 692 };
 693 #undef P
 694
 695 /* Verify that uintmax_t is not wider than
 696    the integers used to generate primes.h.  */
 697 static_assert (W <= WIDE_UINT_BITS);
 698
 699 /* debugging for developers.  Enables devmsg().
 700    This flag is used only in the GMP code.  */
 701 static bool dev_debug = false;
 702
 703 /* Prove primality or run probabilistic tests.  */
 704 static bool flag_prove_primality = PROVE_PRIMALITY;
 705
 706 /* Number of Miller-Rabin tests to run when not proving primality.  */
 707 #define MR_REPS 25
 708
 709 static void
 710 factor_insert_refind (struct factors *factors, uintmax_t p, int i, int off)
 711 {
 712   for (int j = 0; j < off; j++)
 713     p += primes_diff[i + j];
 714   factor_insert (factors, p);
 715 }
 716
 717 /* Trial division with odd primes uses the following trick.
 718
 719    Let p be an odd prime, and B = 2^{W_TYPE_SIZE}.  For simplicity,
 720    consider the case t < B (this is the second loop below).
 721
 722    From our tables we get
 723
 724      binv = p^{-1} (mod B)
 725      lim = floor ((B-1) / p).
 726
 727    First assume that t is a multiple of p, t = q * p.  Then 0 <= q <= lim
 728    (and all quotients in this range occur for some t).
 729
 730    Then t = q * p is true also (mod B), and p is invertible we get
 731
 732      q = t * binv (mod B).
 733
 734    Next, assume that t is *not* divisible by p.  Since multiplication
 735    by binv (mod B) is a one-to-one mapping,
 736
 737      t * binv (mod B) > lim,
 738
 739    because all the smaller values are already taken.
 740
 741    This can be summed up by saying that the function
 742
 743      q(t) = binv * t (mod B)
 744
 745    is a permutation of the range 0 <= t < B, with the curious property
 746    that it maps the multiples of p onto the range 0 <= q <= lim, in
 747    order, and the non-multiples of p onto the range lim < q < B.
 748  */
 749
 750 static uintmax_t
 751 factor_using_division (uintmax_t *t1p, uintmax_t t1, uintmax_t t0,
 752                        struct factors *factors)
 753 {
 754   if (t0 % 2 == 0)
 755     {
 756       int cnt;
 757
 758       if (t0 == 0)
 759         {
 760           count_trailing_zeros (cnt, t1);
 761           t0 = t1 >> cnt;
 762           t1 = 0;
 763           cnt += W_TYPE_SIZE;
 764         }
 765       else
 766         {
 767           count_trailing_zeros (cnt, t0);
 768           rsh2 (t1, t0, t1, t0, cnt);
 769         }
 770
 771       factor_insert_multiplicity (factors, 2, cnt);
 772     }
 773
 774   uintmax_t p = 3;
 775   idx_t i;
 776   for (i = 0; t1 > 0 && i < PRIMES_PTAB_ENTRIES; i++)
 777     {
 778       for (;;)
 779         {
 780           uintmax_t q1, q0, hi;
 781           MAYBE_UNUSED uintmax_t lo;
 782
 783           q0 = t0 * primes_dtab[i].binv;
 784           umul_ppmm (hi, lo, q0, p);
 785           if (hi > t1)
 786             break;
 787           hi = t1 - hi;
 788           q1 = hi * primes_dtab[i].binv;
 789           if (LIKELY (q1 > primes_dtab[i].lim))
 790             break;
 791           t1 = q1; t0 = q0;
 792           factor_insert (factors, p);
 793         }
 794       p += primes_diff[i + 1];
 795     }
 796   if (t1p)
 797     *t1p = t1;
 798
 799 #define DIVBLOCK(I)                                                     \
 800   do {                                                                  \
 801     for (;;)                                                            \
 802       {                                                                 \
 803         q = t0 * pd[I].binv;                                            \
 804         if (LIKELY (q > pd[I].lim))                                     \
 805           break;                                                        \
 806         t0 = q;                                                         \
 807         factor_insert_refind (factors, p, i + 1, I);                    \
 808       }                                                                 \
 809   } while (0)
 810
 811   for (; i < PRIMES_PTAB_ENTRIES; i += 8)
 812     {
 813       uintmax_t q;
 814       const struct primes_dtab *pd = &primes_dtab[i];
 815       DIVBLOCK (0);
 816       DIVBLOCK (1);
 817       DIVBLOCK (2);
 818       DIVBLOCK (3);
 819       DIVBLOCK (4);
 820       DIVBLOCK (5);
 821       DIVBLOCK (6);
 822       DIVBLOCK (7);
 823
 824       p += primes_diff8[i];
 825       if (p * p > t0)
 826         break;
 827     }
 828
 829   return t0;
 830 }
 831
 832 static void
 833 mp_factor_using_division (mpz_t t, struct mp_factors *factors)
 834 {
 835   mpz_t q;
 836   mp_bitcnt_t p;
 837
 838   devmsg ("[trial division] ");
 839
 840   mpz_init (q);
 841
 842   p = mpz_scan1 (t, 0);
 843   mpz_fdiv_q_2exp (t, t, p);
 844   while (p)
 845     {
 846       mp_factor_insert_ui (factors, 2);
 847       --p;
 848     }
 849
 850   unsigned long int d = 3;
 851   for (idx_t i = 1; i <= PRIMES_PTAB_ENTRIES;)
 852     {
 853       if (! mpz_divisible_ui_p (t, d))
 854         {
 855           d += primes_diff[i++];
 856           if (mpz_cmp_ui (t, d * d) < 0)
 857             break;
 858         }
 859       else
 860         {
 861           mpz_tdiv_q_ui (t, t, d);
 862           mp_factor_insert_ui (factors, d);
 863         }
 864     }
 865
 866   mpz_clear (q);
 867 }
 868
 869 /* Entry i contains (2i+1)^(-1) mod 2^8.  */
 870 static const unsigned char  binvert_table[128] =
 871 {
 872   0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
 873   0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
 874   0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
 875   0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
 876   0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
 877   0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
 878   0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
 879   0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
 880   0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
 881   0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
 882   0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
 883   0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
 884   0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
 885   0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
 886   0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
 887   0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
 888 };
 889
 890 /* Compute n^(-1) mod B, using a Newton iteration.  */
 891 #define binv(inv,n)                                                     \
 892   do {                                                                  \
 893     uintmax_t  __n = (n);                                               \
 894     uintmax_t  __inv;                                                   \
 895                                                                         \
 896     __inv = binvert_table[(__n / 2) & 0x7F]; /*  8 */                   \
 897     if (W_TYPE_SIZE > 8)   __inv = 2 * __inv - __inv * __inv * __n;     \
 898     if (W_TYPE_SIZE > 16)  __inv = 2 * __inv - __inv * __inv * __n;     \
 899     if (W_TYPE_SIZE > 32)  __inv = 2 * __inv - __inv * __inv * __n;     \
 900                                                                         \
 901     if (W_TYPE_SIZE > 64)                                               \
 902       {                                                                 \
 903         int  __invbits = 64;                                            \
 904         do {                                                            \
 905           __inv = 2 * __inv - __inv * __inv * __n;                      \
 906           __invbits *= 2;                                               \
 907         } while (__invbits < W_TYPE_SIZE);                              \
 908       }                                                                 \
 909                                                                         \
 910     (inv) = __inv;                                                      \
 911   } while (0)
 912
 913 /* q = u / d, assuming d|u.  */
 914 #define divexact_21(q1, q0, u1, u0, d)                                  \
 915   do {                                                                  \
 916     uintmax_t _di, _q0;                                                 \
 917     binv (_di, (d));                                                    \
 918     _q0 = (u0) * _di;                                                   \
 919     if ((u1) >= (d))                                                    \
 920       {                                                                 \
 921         uintmax_t _p1;                                                  \
 922         MAYBE_UNUSED intmax_t _p0;                                      \
 923         umul_ppmm (_p1, _p0, _q0, d);                                   \
 924         (q1) = ((u1) - _p1) * _di;                                      \
 925         (q0) = _q0;                                                     \
 926       }                                                                 \
 927     else                                                                \
 928       {                                                                 \
 929         (q0) = _q0;                                                     \
 930         (q1) = 0;                                                       \
 931       }                                                                 \
 932   } while (0)
 933
 934 /* x B (mod n).  */
 935 #define redcify(r_prim, r, n)                                           \
 936   do {                                                                  \
 937     MAYBE_UNUSED uintmax_t _redcify_q;                                  \
 938     udiv_qrnnd (_redcify_q, r_prim, r, 0, n);                           \
 939   } while (0)
 940
 941 /* x B^2 (mod n).  Requires x > 0, n1 < B/2.  */
 942 #define redcify2(r1, r0, x, n1, n0)                                     \
 943   do {                                                                  \
 944     uintmax_t _r1, _r0, _i;                                             \
 945     if ((x) < (n1))                                                     \
 946       {                                                                 \
 947         _r1 = (x); _r0 = 0;                                             \
 948         _i = W_TYPE_SIZE;                                               \
 949       }                                                                 \
 950     else                                                                \
 951       {                                                                 \
 952         _r1 = 0; _r0 = (x);                                             \
 953         _i = 2 * W_TYPE_SIZE;                                           \
 954       }                                                                 \
 955     while (_i-- > 0)                                                    \
 956       {                                                                 \
 957         lsh2 (_r1, _r0, _r1, _r0, 1);                                   \
 958         if (ge2 (_r1, _r0, (n1), (n0)))                                 \
 959           sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0));                  \
 960       }                                                                 \
 961     (r1) = _r1;                                                         \
 962     (r0) = _r0;                                                         \
 963   } while (0)
 964
 965 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 966    Both a and b must be in redc form, the result will be in redc form too.  */
 967 static inline uintmax_t
 968 mulredc (uintmax_t a, uintmax_t b, uintmax_t m, uintmax_t mi)
 969 {
 970   uintmax_t rh, rl, q, th, xh;
 971   MAYBE_UNUSED uintmax_t tl;
 972
 973   umul_ppmm (rh, rl, a, b);
 974   q = rl * mi;
 975   umul_ppmm (th, tl, q, m);
 976   xh = rh - th;
 977   if (rh < th)
 978     xh += m;
 979
 980   return xh;
 981 }
 982
 983 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
 984    Both a and b must be in redc form, the result will be in redc form too.
 985    For performance reasons, the most significant bit of m must be clear.  */
 986 static uintmax_t
 987 mulredc2 (uintmax_t *r1p,
 988           uintmax_t a1, uintmax_t a0, uintmax_t b1, uintmax_t b0,
 989           uintmax_t m1, uintmax_t m0, uintmax_t mi)
 990 {
 991   uintmax_t r1, r0, q, p1, t1, t0, s1, s0;
 992   MAYBE_UNUSED uintmax_t p0;
 993   mi = -mi;
 994   affirm ((a1 >> (W_TYPE_SIZE - 1)) == 0);
 995   affirm ((b1 >> (W_TYPE_SIZE - 1)) == 0);
 996   affirm ((m1 >> (W_TYPE_SIZE - 1)) == 0);
 997
 998   /* First compute a0 * <b1, b0> B^{-1}
 999         +-----+
1000         |a0 b0|
1001      +--+--+--+
1002      |a0 b1|
1003      +--+--+--+
1004         |q0 m0|
1005      +--+--+--+
1006      |q0 m1|
1007     -+--+--+--+
1008      |r1|r0| 0|
1009      +--+--+--+
1010   */
1011   umul_ppmm (t1, t0, a0, b0);
1012   umul_ppmm (r1, r0, a0, b1);
1013   q = mi * t0;
1014   umul_ppmm (p1, p0, q, m0);
1015   umul_ppmm (s1, s0, q, m1);
1016   r0 += (t0 != 0); /* Carry */
1017   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1018   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1019   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1020
1021   /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1022         +-----+
1023         |a1 b0|
1024         +--+--+
1025         |r1|r0|
1026      +--+--+--+
1027      |a1 b1|
1028      +--+--+--+
1029         |q1 m0|
1030      +--+--+--+
1031      |q1 m1|
1032     -+--+--+--+
1033      |r1|r0| 0|
1034      +--+--+--+
1035   */
1036   umul_ppmm (t1, t0, a1, b0);
1037   umul_ppmm (s1, s0, a1, b1);
1038   add_ssaaaa (t1, t0, t1, t0, 0, r0);
1039   q = mi * t0;
1040   add_ssaaaa (r1, r0, s1, s0, 0, r1);
1041   umul_ppmm (p1, p0, q, m0);
1042   umul_ppmm (s1, s0, q, m1);
1043   r0 += (t0 != 0); /* Carry */
1044   add_ssaaaa (r1, r0, r1, r0, 0, p1);
1045   add_ssaaaa (r1, r0, r1, r0, 0, t1);
1046   add_ssaaaa (r1, r0, r1, r0, s1, s0);
1047
1048   if (ge2 (r1, r0, m1, m0))
1049     sub_ddmmss (r1, r0, r1, r0, m1, m0);
1050
1051   *r1p = r1;
1052   return r0;
1053 }
1054
1055 ATTRIBUTE_CONST
1056 static uintmax_t
1057 powm (uintmax_t b, uintmax_t e, uintmax_t n, uintmax_t ni, uintmax_t one)
1058 {
1059   uintmax_t y = one;
1060
1061   if (e & 1)
1062     y = b;
1063
1064   while (e != 0)
1065     {
1066       b = mulredc (b, b, n, ni);
1067       e >>= 1;
1068
1069       if (e & 1)
1070         y = mulredc (y, b, n, ni);
1071     }
1072
1073   return y;
1074 }
1075
1076 static uintmax_t
1077 powm2 (uintmax_t *r1m,
1078        const uintmax_t *bp, const uintmax_t *ep, const uintmax_t *np,
1079        uintmax_t ni, const uintmax_t *one)
1080 {
1081   uintmax_t r1, r0, b1, b0, n1, n0;
1082   int i;
1083   uintmax_t e;
1084
1085   b0 = bp[0];
1086   b1 = bp[1];
1087   n0 = np[0];
1088   n1 = np[1];
1089
1090   r0 = one[0];
1091   r1 = one[1];
1092
1093   for (e = ep[0], i = W_TYPE_SIZE; i > 0; i--, e >>= 1)
1094     {
1095       if (e & 1)
1096         {
1097           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1098           r1 = *r1m;
1099         }
1100       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1101       b1 = *r1m;
1102     }
1103   for (e = ep[1]; e > 0; e >>= 1)
1104     {
1105       if (e & 1)
1106         {
1107           r0 = mulredc2 (r1m, r1, r0, b1, b0, n1, n0, ni);
1108           r1 = *r1m;
1109         }
1110       b0 = mulredc2 (r1m, b1, b0, b1, b0, n1, n0, ni);
1111       b1 = *r1m;
1112     }
1113   *r1m = r1;
1114   return r0;
1115 }
1116
1117 ATTRIBUTE_CONST
1118 static bool
1119 millerrabin (uintmax_t n, uintmax_t ni, uintmax_t b, uintmax_t q,
1120              int k, uintmax_t one)
1121 {
1122   uintmax_t y = powm (b, q, n, ni, one);
1123
1124   uintmax_t nm1 = n - one;      /* -1, but in redc representation.  */
1125
1126   if (y == one || y == nm1)
1127     return true;
1128
1129   for (int i = 1; i < k; i++)
1130     {
1131       y = mulredc (y, y, n, ni);
1132
1133       if (y == nm1)
1134         return true;
1135       if (y == one)
1136         return false;
1137     }
1138   return false;
1139 }
1140
1141 ATTRIBUTE_PURE static bool
1142 millerrabin2 (const uintmax_t *np, uintmax_t ni, const uintmax_t *bp,
1143               const uintmax_t *qp, int k, const uintmax_t *one)
1144 {
1145   uintmax_t y1, y0, nm1_1, nm1_0, r1m;
1146
1147   y0 = powm2 (&r1m, bp, qp, np, ni, one);
1148   y1 = r1m;
1149
1150   if (y0 == one[0] && y1 == one[1])
1151     return true;
1152
1153   sub_ddmmss (nm1_1, nm1_0, np[1], np[0], one[1], one[0]);
1154
1155   if (y0 == nm1_0 && y1 == nm1_1)
1156     return true;
1157
1158   for (int i = 1; i < k; i++)
1159     {
1160       y0 = mulredc2 (&r1m, y1, y0, y1, y0, np[1], np[0], ni);
1161       y1 = r1m;
1162
1163       if (y0 == nm1_0 && y1 == nm1_1)
1164         return true;
1165       if (y0 == one[0] && y1 == one[1])
1166         return false;
1167     }
1168   return false;
1169 }
1170
1171 static bool
1172 mp_millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
1173                 mpz_srcptr q, mp_bitcnt_t k)
1174 {
1175   mpz_powm (y, x, q, n);
1176
1177   if (mpz_cmp_ui (y, 1) == 0 || mpz_cmp (y, nm1) == 0)
1178     return true;
1179
1180   for (mp_bitcnt_t i = 1; i < k; i++)
1181     {
1182       mpz_powm_ui (y, y, 2, n);
1183       if (mpz_cmp (y, nm1) == 0)
1184         return true;
1185       if (mpz_cmp_ui (y, 1) == 0)
1186         return false;
1187     }
1188   return false;
1189 }
1190
1191 /* Lucas' prime test.  The number of iterations vary greatly, up to a few dozen
1192    have been observed.  The average seem to be about 2.  */
1193 static bool ATTRIBUTE_PURE
1194 prime_p (uintmax_t n)
1195 {
1196   mp_bitcnt_t k;
1197   bool is_prime;
1198   uintmax_t a_prim, one, ni;
1199   struct factors factors;
1200
1201   if (n <= 1)
1202     return false;
1203
1204   /* We have already casted out small primes.  */
1205   if (n < (uintmax_t) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME)
1206     return true;
1207
1208   /* Precomputation for Miller-Rabin.  */
1209   uintmax_t q = n - 1;
1210   for (k = 0; (q & 1) == 0; k++)
1211     q >>= 1;
1212
1213   uintmax_t a = 2;
1214   binv (ni, n);                 /* ni <- 1/n mod B */
1215   redcify (one, 1, n);
1216   addmod (a_prim, one, one, n); /* i.e., redcify a = 2 */
1217
1218   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1219   if (!millerrabin (n, ni, a_prim, q, k, one))
1220     return false;
1221
1222   if (flag_prove_primality)
1223     {
1224       /* Factor n-1 for Lucas.  */
1225       factor (0, n - 1, &factors);
1226     }
1227
1228   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1229      number composite.  */
1230   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1231     {
1232       if (flag_prove_primality)
1233         {
1234           is_prime = true;
1235           for (int i = 0; i < factors.nfactors && is_prime; i++)
1236             {
1237               is_prime
1238                 = powm (a_prim, (n - 1) / factors.p[i], n, ni, one) != one;
1239             }
1240         }
1241       else
1242         {
1243           /* After enough Miller-Rabin runs, be content.  */
1244           is_prime = (r == MR_REPS - 1);
1245         }
1246
1247       if (is_prime)
1248         return true;
1249
1250       a += primes_diff[r];      /* Establish new base.  */
1251
1252       /* The following is equivalent to redcify (a_prim, a, n).  It runs faster
1253          on most processors, since it avoids udiv_qrnnd.  If we go down the
1254          udiv_qrnnd_preinv path, this code should be replaced.  */
1255       {
1256         uintmax_t s1, s0;
1257         umul_ppmm (s1, s0, one, a);
1258         if (LIKELY (s1 == 0))
1259           a_prim = s0 % n;
1260         else
1261           {
1262             MAYBE_UNUSED uintmax_t dummy;
1263             udiv_qrnnd (dummy, a_prim, s1, s0, n);
1264           }
1265       }
1266
1267       if (!millerrabin (n, ni, a_prim, q, k, one))
1268         return false;
1269     }
1270
1271   affirm (!"Lucas prime test failure.  This should not happen");
1272 }
1273
1274 static bool ATTRIBUTE_PURE
1275 prime2_p (uintmax_t n1, uintmax_t n0)
1276 {
1277   uintmax_t q[2], nm1[2];
1278   uintmax_t a_prim[2];
1279   uintmax_t one[2];
1280   uintmax_t na[2];
1281   uintmax_t ni;
1282   int k;
1283   struct factors factors;
1284
1285   if (n1 == 0)
1286     return prime_p (n0);
1287
1288   nm1[1] = n1 - (n0 == 0);
1289   nm1[0] = n0 - 1;
1290   if (nm1[0] == 0)
1291     {
1292       count_trailing_zeros (k, nm1[1]);
1293
1294       q[0] = nm1[1] >> k;
1295       q[1] = 0;
1296       k += W_TYPE_SIZE;
1297     }
1298   else
1299     {
1300       count_trailing_zeros (k, nm1[0]);
1301       rsh2 (q[1], q[0], nm1[1], nm1[0], k);
1302     }
1303
1304   uintmax_t a = 2;
1305   binv (ni, n0);
1306   redcify2 (one[1], one[0], 1, n1, n0);
1307   addmod2 (a_prim[1], a_prim[0], one[1], one[0], one[1], one[0], n1, n0);
1308
1309   /* FIXME: Use scalars or pointers in arguments?  Some consistency needed.  */
1310   na[0] = n0;
1311   na[1] = n1;
1312
1313   if (!millerrabin2 (na, ni, a_prim, q, k, one))
1314     return false;
1315
1316   if (flag_prove_primality)
1317     {
1318       /* Factor n-1 for Lucas.  */
1319       factor (nm1[1], nm1[0], &factors);
1320     }
1321
1322   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323      number composite.  */
1324   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1325     {
1326       bool is_prime;
1327       uintmax_t e[2], y[2];
1328
1329       if (flag_prove_primality)
1330         {
1331           is_prime = true;
1332           if (factors.plarge[1])
1333             {
1334               uintmax_t pi;
1335               binv (pi, factors.plarge[0]);
1336               e[0] = pi * nm1[0];
1337               e[1] = 0;
1338               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1339               is_prime = (y[0] != one[0] || y[1] != one[1]);
1340             }
1341           for (int i = 0; i < factors.nfactors && is_prime; i++)
1342             {
1343               /* FIXME: We always have the factor 2.  Do we really need to
1344                  handle it here?  We have done the same powering as part
1345                  of millerrabin.  */
1346               if (factors.p[i] == 2)
1347                 rsh2 (e[1], e[0], nm1[1], nm1[0], 1);
1348               else
1349                 divexact_21 (e[1], e[0], nm1[1], nm1[0], factors.p[i]);
1350               y[0] = powm2 (&y[1], a_prim, e, na, ni, one);
1351               is_prime = (y[0] != one[0] || y[1] != one[1]);
1352             }
1353         }
1354       else
1355         {
1356           /* After enough Miller-Rabin runs, be content.  */
1357           is_prime = (r == MR_REPS - 1);
1358         }
1359
1360       if (is_prime)
1361         return true;
1362
1363       a += primes_diff[r];      /* Establish new base.  */
1364       redcify2 (a_prim[1], a_prim[0], a, n1, n0);
1365
1366       if (!millerrabin2 (na, ni, a_prim, q, k, one))
1367         return false;
1368     }
1369
1370   affirm (!"Lucas prime test failure.  This should not happen");
1371 }
1372
1373 static bool
1374 mp_prime_p (mpz_t n)
1375 {
1376   bool is_prime;
1377   mpz_t q, a, nm1, tmp;
1378   struct mp_factors factors;
1379
1380   if (mpz_cmp_ui (n, 1) <= 0)
1381     return false;
1382
1383   /* We have already casted out small primes.  */
1384   if (mpz_cmp_ui (n, (long) FIRST_OMITTED_PRIME * FIRST_OMITTED_PRIME) < 0)
1385     return true;
1386
1387   mpz_inits (q, a, nm1, tmp, nullptr);
1388
1389   /* Precomputation for Miller-Rabin.  */
1390   mpz_sub_ui (nm1, n, 1);
1391
1392   /* Find q and k, where q is odd and n = 1 + 2**k * q.  */
1393   mp_bitcnt_t k = mpz_scan1 (nm1, 0);
1394   mpz_tdiv_q_2exp (q, nm1, k);
1395
1396   mpz_set_ui (a, 2);
1397
1398   /* Perform a Miller-Rabin test, finds most composites quickly.  */
1399   if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1400     {
1401       is_prime = false;
1402       goto ret2;
1403     }
1404
1405   if (flag_prove_primality)
1406     {
1407       /* Factor n-1 for Lucas.  */
1408       mpz_set (tmp, nm1);
1409       mp_factor (tmp, &factors);
1410     }
1411
1412   /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1413      number composite.  */
1414   for (idx_t r = 0; r < PRIMES_PTAB_ENTRIES; r++)
1415     {
1416       if (flag_prove_primality)
1417         {
1418           is_prime = true;
1419           for (idx_t i = 0; i < factors.nfactors && is_prime; i++)
1420             {
1421               mpz_divexact (tmp, nm1, factors.p[i]);
1422               mpz_powm (tmp, a, tmp, n);
1423               is_prime = mpz_cmp_ui (tmp, 1) != 0;
1424             }
1425         }
1426       else
1427         {
1428           /* After enough Miller-Rabin runs, be content.  */
1429           is_prime = (r == MR_REPS - 1);
1430         }
1431
1432       if (is_prime)
1433         goto ret1;
1434
1435       mpz_add_ui (a, a, primes_diff[r]);        /* Establish new base.  */
1436
1437       if (!mp_millerrabin (n, nm1, a, tmp, q, k))
1438         {
1439           is_prime = false;
1440           goto ret1;
1441         }
1442     }
1443
1444   affirm (!"Lucas prime test failure.  This should not happen");
1445
1446  ret1:
1447   if (flag_prove_primality)
1448     mp_factor_clear (&factors);
1449  ret2:
1450   mpz_clears (q, a, nm1, tmp, nullptr);
1451
1452   return is_prime;
1453 }
1454
1455 static void
1456 factor_using_pollard_rho (uintmax_t n, unsigned long int a,
1457                           struct factors *factors)
1458 {
1459   uintmax_t x, z, y, P, t, ni, g;
1460
1461   unsigned long int k = 1;
1462   unsigned long int l = 1;
1463
1464   redcify (P, 1, n);
1465   addmod (x, P, P, n);          /* i.e., redcify(2) */
1466   y = z = x;
1467
1468   while (n != 1)
1469     {
1470       affirm (a < n);
1471
1472       binv (ni, n);             /* FIXME: when could we use old 'ni' value?  */
1473
1474       for (;;)
1475         {
1476           do
1477             {
1478               x = mulredc (x, x, n, ni);
1479               addmod (x, x, a, n);
1480
1481               submod (t, z, x, n);
1482               P = mulredc (P, t, n, ni);
1483
1484               if (k % 32 == 1)
1485                 {
1486                   if (gcd_odd (P, n) != 1)
1487                     goto factor_found;
1488                   y = x;
1489                 }
1490             }
1491           while (--k != 0);
1492
1493           z = x;
1494           k = l;
1495           l = 2 * l;
1496           for (unsigned long int i = 0; i < k; i++)
1497             {
1498               x = mulredc (x, x, n, ni);
1499               addmod (x, x, a, n);
1500             }
1501           y = x;
1502         }
1503
1504     factor_found:
1505       do
1506         {
1507           y = mulredc (y, y, n, ni);
1508           addmod (y, y, a, n);
1509
1510           submod (t, z, y, n);
1511           g = gcd_odd (t, n);
1512         }
1513       while (g == 1);
1514
1515       if (n == g)
1516         {
1517           /* Found n itself as factor.  Restart with different params.  */
1518           factor_using_pollard_rho (n, a + 1, factors);
1519           return;
1520         }
1521
1522       n = n / g;
1523
1524       if (!prime_p (g))
1525         factor_using_pollard_rho (g, a + 1, factors);
1526       else
1527         factor_insert (factors, g);
1528
1529       if (prime_p (n))
1530         {
1531           factor_insert (factors, n);
1532           break;
1533         }
1534
1535       x = x % n;
1536       z = z % n;
1537       y = y % n;
1538     }
1539 }
1540
1541 static void
1542 factor_using_pollard_rho2 (uintmax_t n1, uintmax_t n0, unsigned long int a,
1543                            struct factors *factors)
1544 {
1545   uintmax_t x1, x0, z1, z0, y1, y0, P1, P0, t1, t0, ni, g1, g0, r1m;
1546
1547   unsigned long int k = 1;
1548   unsigned long int l = 1;
1549
1550   redcify2 (P1, P0, 1, n1, n0);
1551   addmod2 (x1, x0, P1, P0, P1, P0, n1, n0); /* i.e., redcify(2) */
1552   y1 = z1 = x1;
1553   y0 = z0 = x0;
1554
1555   while (n1 != 0 || n0 != 1)
1556     {
1557       binv (ni, n0);
1558
1559       for (;;)
1560         {
1561           do
1562             {
1563               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1564               x1 = r1m;
1565               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1566
1567               submod2 (t1, t0, z1, z0, x1, x0, n1, n0);
1568               P0 = mulredc2 (&r1m, P1, P0, t1, t0, n1, n0, ni);
1569               P1 = r1m;
1570
1571               if (k % 32 == 1)
1572                 {
1573                   g0 = gcd2_odd (&g1, P1, P0, n1, n0);
1574                   if (g1 != 0 || g0 != 1)
1575                     goto factor_found;
1576                   y1 = x1; y0 = x0;
1577                 }
1578             }
1579           while (--k != 0);
1580
1581           z1 = x1; z0 = x0;
1582           k = l;
1583           l = 2 * l;
1584           for (unsigned long int i = 0; i < k; i++)
1585             {
1586               x0 = mulredc2 (&r1m, x1, x0, x1, x0, n1, n0, ni);
1587               x1 = r1m;
1588               addmod2 (x1, x0, x1, x0, 0, (uintmax_t) a, n1, n0);
1589             }
1590           y1 = x1; y0 = x0;
1591         }
1592
1593     factor_found:
1594       do
1595         {
1596           y0 = mulredc2 (&r1m, y1, y0, y1, y0, n1, n0, ni);
1597           y1 = r1m;
1598           addmod2 (y1, y0, y1, y0, 0, (uintmax_t) a, n1, n0);
1599
1600           submod2 (t1, t0, z1, z0, y1, y0, n1, n0);
1601           g0 = gcd2_odd (&g1, t1, t0, n1, n0);
1602         }
1603       while (g1 == 0 && g0 == 1);
1604
1605       if (g1 == 0)
1606         {
1607           /* The found factor is one word, and > 1.  */
1608           divexact_21 (n1, n0, n1, n0, g0);     /* n = n / g */
1609
1610           if (!prime_p (g0))
1611             factor_using_pollard_rho (g0, a + 1, factors);
1612           else
1613             factor_insert (factors, g0);
1614         }
1615       else
1616         {
1617           /* The found factor is two words.  This is highly unlikely, thus hard
1618              to trigger.  Please be careful before you change this code!  */
1619           uintmax_t ginv;
1620
1621           if (n1 == g1 && n0 == g0)
1622             {
1623               /* Found n itself as factor.  Restart with different params.  */
1624               factor_using_pollard_rho2 (n1, n0, a + 1, factors);
1625               return;
1626             }
1627
1628           /* Compute n = n / g.  Since the result will fit one word,
1629              we can compute the quotient modulo B, ignoring the high
1630              divisor word.  */
1631           binv (ginv, g0);
1632           n0 = ginv * n0;
1633           n1 = 0;
1634
1635           if (!prime2_p (g1, g0))
1636             factor_using_pollard_rho2 (g1, g0, a + 1, factors);
1637           else
1638             factor_insert_large (factors, g1, g0);
1639         }
1640
1641       if (n1 == 0)
1642         {
1643           if (prime_p (n0))
1644             {
1645               factor_insert (factors, n0);
1646               break;
1647             }
1648
1649           factor_using_pollard_rho (n0, a, factors);
1650           return;
1651         }
1652
1653       if (prime2_p (n1, n0))
1654         {
1655           factor_insert_large (factors, n1, n0);
1656           break;
1657         }
1658
1659       x0 = mod2 (&x1, x1, x0, n1, n0);
1660       z0 = mod2 (&z1, z1, z0, n1, n0);
1661       y0 = mod2 (&y1, y1, y0, n1, n0);
1662     }
1663 }
1664
1665 static void
1666 mp_factor_using_pollard_rho (mpz_t n, unsigned long int a,
1667                              struct mp_factors *factors)
1668 {
1669   mpz_t x, z, y, P;
1670   mpz_t t, t2;
1671
1672   devmsg ("[pollard-rho (%lu)] ", a);
1673
1674   mpz_inits (t, t2, nullptr);
1675   mpz_init_set_si (y, 2);
1676   mpz_init_set_si (x, 2);
1677   mpz_init_set_si (z, 2);
1678   mpz_init_set_ui (P, 1);
1679
1680   unsigned long long int k = 1;
1681   unsigned long long int l = 1;
1682
1683   while (mpz_cmp_ui (n, 1) != 0)
1684     {
1685       for (;;)
1686         {
1687           do
1688             {
1689               mpz_mul (t, x, x);
1690               mpz_mod (x, t, n);
1691               mpz_add_ui (x, x, a);
1692
1693               mpz_sub (t, z, x);
1694               mpz_mul (t2, P, t);
1695               mpz_mod (P, t2, n);
1696
1697               if (k % 32 == 1)
1698                 {
1699                   mpz_gcd (t, P, n);
1700                   if (mpz_cmp_ui (t, 1) != 0)
1701                     goto factor_found;
1702                   mpz_set (y, x);
1703                 }
1704             }
1705           while (--k != 0);
1706
1707           mpz_set (z, x);
1708           k = l;
1709           l = 2 * l;
1710           for (unsigned long long int i = 0; i < k; i++)
1711             {
1712               mpz_mul (t, x, x);
1713               mpz_mod (x, t, n);
1714               mpz_add_ui (x, x, a);
1715             }
1716           mpz_set (y, x);
1717         }
1718
1719     factor_found:
1720       do
1721         {
1722           mpz_mul (t, y, y);
1723           mpz_mod (y, t, n);
1724           mpz_add_ui (y, y, a);
1725
1726           mpz_sub (t, z, y);
1727           mpz_gcd (t, t, n);
1728         }
1729       while (mpz_cmp_ui (t, 1) == 0);
1730
1731       mpz_divexact (n, n, t);   /* divide by t, before t is overwritten */
1732
1733       if (!mp_prime_p (t))
1734         {
1735           devmsg ("[composite factor--restarting pollard-rho] ");
1736           mp_factor_using_pollard_rho (t, a + 1, factors);
1737         }
1738       else
1739         {
1740           mp_factor_insert (factors, t);
1741         }
1742
1743       if (mp_prime_p (n))
1744         {
1745           mp_factor_insert (factors, n);
1746           break;
1747         }
1748
1749       mpz_mod (x, x, n);
1750       mpz_mod (z, z, n);
1751       mpz_mod (y, y, n);
1752     }
1753
1754   mpz_clears (P, t2, t, z, x, y, nullptr);
1755 }
1756
1757 #if USE_SQUFOF
1758 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)?  If
1759    algorithm is replaced, consider also returning the remainder.  */
1760 ATTRIBUTE_CONST
1761 static uintmax_t
1762 isqrt (uintmax_t n)
1763 {
1764   uintmax_t x;
1765   int c;
1766   if (n == 0)
1767     return 0;
1768
1769   count_leading_zeros (c, n);
1770
1771   /* Make x > sqrt(n).  This will be invariant through the loop.  */
1772   x = (uintmax_t) 1 << ((W_TYPE_SIZE + 1 - c) >> 1);
1773
1774   for (;;)
1775     {
1776       uintmax_t y = (x + n / x) / 2;
1777       if (y >= x)
1778         return x;
1779
1780       x = y;
1781     }
1782 }
1783
1784 ATTRIBUTE_CONST
1785 static uintmax_t
1786 isqrt2 (uintmax_t nh, uintmax_t nl)
1787 {
1788   int shift;
1789   uintmax_t x;
1790
1791   /* Ensures the remainder fits in an uintmax_t.  */
1792   affirm (nh < ((uintmax_t) 1 << (W_TYPE_SIZE - 2)));
1793
1794   if (nh == 0)
1795     return isqrt (nl);
1796
1797   count_leading_zeros (shift, nh);
1798   shift &= ~1;
1799
1800   /* Make x > sqrt (n).  */
1801   x = isqrt ((nh << shift) + (nl >> (W_TYPE_SIZE - shift))) + 1;
1802   x <<= (W_TYPE_SIZE - shift) >> 1;
1803
1804   /* Do we need more than one iteration?  */
1805   for (;;)
1806     {
1807       MAYBE_UNUSED uintmax_t r;
1808       uintmax_t q, y;
1809       udiv_qrnnd (q, r, nh, nl, x);
1810       y = (x + q) / 2;
1811
1812       if (y >= x)
1813         {
1814           uintmax_t hi, lo;
1815           umul_ppmm (hi, lo, x + 1, x + 1);
1816           affirm (gt2 (hi, lo, nh, nl));
1817
1818           umul_ppmm (hi, lo, x, x);
1819           affirm (ge2 (nh, nl, hi, lo));
1820           sub_ddmmss (hi, lo, nh, nl, hi, lo);
1821           affirm (hi == 0);
1822
1823           return x;
1824         }
1825
1826       x = y;
1827     }
1828 }
1829
1830 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N.  */
1831 # define MAGIC64 0x0202021202030213ULL
1832 # define MAGIC63 0x0402483012450293ULL
1833 # define MAGIC65 0x218a019866014613ULL
1834 # define MAGIC11 0x23b
1835
1836 /* Return the square root if the input is a square, otherwise 0.  */
1837 ATTRIBUTE_CONST
1838 static uintmax_t
1839 is_square (uintmax_t x)
1840 {
1841   /* Uses the tests suggested by Cohen.  Excludes 99% of the non-squares before
1842      computing the square root.  */
1843   if (((MAGIC64 >> (x & 63)) & 1)
1844       && ((MAGIC63 >> (x % 63)) & 1)
1845       /* Both 0 and 64 are squares mod (65).  */
1846       && ((MAGIC65 >> ((x % 65) & 63)) & 1)
1847       && ((MAGIC11 >> (x % 11) & 1)))
1848     {
1849       uintmax_t r = isqrt (x);
1850       if (r * r == x)
1851         return r;
1852     }
1853   return 0;
1854 }
1855
1856 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1857 static short const invtab[0x81] =
1858   {
1859     0x200,
1860     0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1861     0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1862     0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1863     0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1864     0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1865     0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1866     0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1867     0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1868     0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1869     0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1870     0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1871     0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1872     0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1873     0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1874     0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1875     0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1876   };
1877
1878 /* Compute q = [u/d], r = u mod d.  Avoids slow hardware division for the case
1879    that q < 0x40; here it instead uses a table of (Euclidian) inverses.  */
1880 # define div_smallq(q, r, u, d)                                          \
1881   do {                                                                  \
1882     if ((u) / 0x40 < (d))                                               \
1883       {                                                                 \
1884         int _cnt;                                                       \
1885         uintmax_t _dinv, _mask, _q, _r;                                 \
1886         count_leading_zeros (_cnt, (d));                                \
1887         _r = (u);                                                       \
1888         if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8)))                        \
1889           {                                                             \
1890             _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80];   \
1891             _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt);                \
1892           }                                                             \
1893         else                                                            \
1894           {                                                             \
1895             _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f];   \
1896             _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11;        \
1897           }                                                             \
1898         _r -= _q * (d);                                                 \
1899                                                                         \
1900         _mask = -(uintmax_t) (_r >= (d));                               \
1901         (r) = _r - (_mask & (d));                                       \
1902         (q) = _q - _mask;                                               \
1903         affirm ((q) * (d) + (r) == u);                                  \
1904       }                                                                 \
1905     else                                                                \
1906       {                                                                 \
1907         uintmax_t _q = (u) / (d);                                       \
1908         (r) = (u) - _q * (d);                                           \
1909         (q) = _q;                                                       \
1910       }                                                                 \
1911   } while (0)
1912
1913 /* Notes: Example N = 22117019.  After first phase we find Q1 = 6314, Q
1914    = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1915    with 3025 = 55^2.
1916
1917    Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1918    representing G_0 = (-55, 2 * 4652, 8653).
1919
1920    In the notation of the paper:
1921
1922    S_{-1} = 55, S_0 = 8653, R_0 = 4652
1923
1924    Put
1925
1926      t_0 = floor([q_0 + R_0] / S0) = 1
1927      R_1 = t_0 * S_0 - R_0 = 4001
1928      S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1929 */
1930
1931 /* Multipliers, in order of efficiency:
1932    0.7268  3*5*7*11 = 1155 = 3 (mod 4)
1933    0.7317  3*5*7    =  105 = 1
1934    0.7820  3*5*11   =  165 = 1
1935    0.7872  3*5      =   15 = 3
1936    0.8101  3*7*11   =  231 = 3
1937    0.8155  3*7      =   21 = 1
1938    0.8284  5*7*11   =  385 = 1
1939    0.8339  5*7      =   35 = 3
1940    0.8716  3*11     =   33 = 1
1941    0.8774  3        =    3 = 3
1942    0.8913  5*11     =   55 = 3
1943    0.8972  5        =    5 = 1
1944    0.9233  7*11     =   77 = 1
1945    0.9295  7        =    7 = 3
1946    0.9934  11       =   11 = 3
1947 */
1948 # define QUEUE_SIZE 50
1949 #endif
1950
1951 #if STAT_SQUFOF
1952 # define Q_FREQ_SIZE 50
1953 /* Element 0 keeps the total */
1954 static int q_freq[Q_FREQ_SIZE + 1];
1955 #endif
1956
1957 #if USE_SQUFOF
1958 /* Return true on success.  Expected to fail only for numbers
1959    >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit.  */
1960 static bool
1961 factor_using_squfof (uintmax_t n1, uintmax_t n0, struct factors *factors)
1962 {
1963   /* Uses algorithm and notation from
1964
1965      SQUARE FORM FACTORIZATION
1966      JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1967
1968      https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1969    */
1970
1971   static short const multipliers_1[] =
1972     { /* = 1 (mod 4) */
1973       105, 165, 21, 385, 33, 5, 77, 1, 0
1974     };
1975   static short const multipliers_3[] =
1976     { /* = 3 (mod 4) */
1977       1155, 15, 231, 35, 3, 55, 7, 11, 0
1978     };
1979
1980   struct { uintmax_t Q; uintmax_t P; } queue[QUEUE_SIZE];
1981
1982   if (n1 >= ((uintmax_t) 1 << (W_TYPE_SIZE - 2)))
1983     return false;
1984
1985   uintmax_t sqrt_n = isqrt2 (n1, n0);
1986
1987   if (n0 == sqrt_n * sqrt_n)
1988     {
1989       uintmax_t p1, p0;
1990
1991       umul_ppmm (p1, p0, sqrt_n, sqrt_n);
1992       affirm (p0 == n0);
1993
1994       if (n1 == p1)
1995         {
1996           if (prime_p (sqrt_n))
1997             factor_insert_multiplicity (factors, sqrt_n, 2);
1998           else
1999             {
2000               struct factors f;
2001
2002               f.nfactors = 0;
2003               if (!factor_using_squfof (0, sqrt_n, &f))
2004                 {
2005                   /* Try pollard rho instead */
2006                   factor_using_pollard_rho (sqrt_n, 1, &f);
2007                 }
2008               /* Duplicate the new factors */
2009               for (unsigned int i = 0; i < f.nfactors; i++)
2010                 factor_insert_multiplicity (factors, f.p[i], 2 * f.e[i]);
2011             }
2012           return true;
2013         }
2014     }
2015
2016   /* Select multipliers so we always get n * mu = 3 (mod 4) */
2017   for (short const *m = (n0 % 4 == 1) ? multipliers_3 : multipliers_1;
2018        *m; m++)
2019     {
2020       uintmax_t S, Dh, Dl, Q1, Q, P, L, L1, B;
2021       unsigned int i;
2022       unsigned int mu = *m;
2023       int qpos = 0;
2024
2025       affirm (mu * n0 % 4 == 3);
2026
2027       /* In the notation of the paper, with mu * n == 3 (mod 4), we
2028          get \Delta = 4 mu * n, and the paper's \mu is 2 mu.  As far as
2029          I understand it, the necessary bound is 4 \mu^3 < n, or 32
2030          mu^3 < n.
2031
2032          However, this seems insufficient: With n = 37243139 and mu =
2033          105, we get a trivial factor, from the square 38809 = 197^2,
2034          without any corresponding Q earlier in the iteration.
2035
2036          Requiring 64 mu^3 < n seems sufficient.  */
2037       if (n1 == 0)
2038         {
2039           if ((uintmax_t) mu * mu * mu >= n0 / 64)
2040             continue;
2041         }
2042       else
2043         {
2044           if (n1 > ((uintmax_t) 1 << (W_TYPE_SIZE - 2)) / mu)
2045             continue;
2046         }
2047       umul_ppmm (Dh, Dl, n0, mu);
2048       Dh += n1 * mu;
2049
2050       affirm (Dl % 4 != 1);
2051       affirm (Dh < (uintmax_t) 1 << (W_TYPE_SIZE - 2));
2052
2053       S = isqrt2 (Dh, Dl);
2054
2055       Q1 = 1;
2056       P = S;
2057
2058       /* Square root remainder fits in one word, so ignore high part.  */
2059       Q = Dl - P * P;
2060       /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))?  */
2061       L = isqrt (2 * S);
2062       B = 2 * L;
2063       L1 = mu * 2 * L;
2064
2065       /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2066          4 D.  */
2067
2068       for (i = 0; i <= B; i++)
2069         {
2070           uintmax_t q, P1, t, rem;
2071
2072           div_smallq (q, rem, S + P, Q);
2073           P1 = S - rem; /* P1 = q*Q - P */
2074
2075           affirm (q > 0 && Q > 0);
2076
2077 # if STAT_SQUFOF
2078           q_freq[0]++;
2079           q_freq[MIN (q, Q_FREQ_SIZE)]++;
2080 # endif
2081
2082           if (Q <= L1)
2083             {
2084               uintmax_t g = Q;
2085
2086               if ((Q & 1) == 0)
2087                 g /= 2;
2088
2089               g /= gcd_odd (g, mu);
2090
2091               if (g <= L)
2092                 {
2093                   if (qpos >= QUEUE_SIZE)
2094                     error (EXIT_FAILURE, 0, _("squfof queue overflow"));
2095                   queue[qpos].Q = g;
2096                   queue[qpos].P = P % g;
2097                   qpos++;
2098                 }
2099             }
2100
2101           /* I think the difference can be either sign, but mod
2102              2^W_TYPE_SIZE arithmetic should be fine.  */
2103           t = Q1 + q * (P - P1);
2104           Q1 = Q;
2105           Q = t;
2106           P = P1;
2107
2108           if ((i & 1) == 0)
2109             {
2110               uintmax_t r = is_square (Q);
2111               if (r)
2112                 {
2113                   for (int j = 0; j < qpos; j++)
2114                     {
2115                       if (queue[j].Q == r)
2116                         {
2117                           if (r == 1)
2118                             /* Traversed entire cycle.  */
2119                             goto next_multiplier;
2120
2121                           /* Need the absolute value for divisibility test.  */
2122                           if (P >= queue[j].P)
2123                             t = P - queue[j].P;
2124                           else
2125                             t = queue[j].P - P;
2126                           if (t % r == 0)
2127                             {
2128                               /* Delete entries up to and including entry
2129                                  j, which matched.  */
2130                               memmove (queue, queue + j + 1,
2131                                        (qpos - j - 1) * sizeof (queue[0]));
2132                               qpos -= (j + 1);
2133                             }
2134                           goto next_i;
2135                         }
2136                     }
2137
2138                   /* We have found a square form, which should give a
2139                      factor.  */
2140                   Q1 = r;
2141                   affirm (S >= P); /* What signs are possible?  */
2142                   P += r * ((S - P) / r);
2143
2144                   /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2145                      for the case D = 2N.  */
2146                   /* Compute Q = (D - P*P) / Q1, but we need double
2147                      precision.  */
2148                   uintmax_t hi, lo;
2149                   umul_ppmm (hi, lo, P, P);
2150                   sub_ddmmss (hi, lo, Dh, Dl, hi, lo);
2151                   udiv_qrnnd (Q, rem, hi, lo, Q1);
2152                   affirm (rem == 0);
2153
2154                   for (;;)
2155                     {
2156                       /* Note: There appears to by a typo in the paper,
2157                          Step 4a in the algorithm description says q <--
2158                          floor([S+P]/\hat Q), but looking at the equations
2159                          in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2160                          (In this code, \hat Q is Q1).  */
2161                       div_smallq (q, rem, S + P, Q);
2162                       P1 = S - rem;     /* P1 = q*Q - P */
2163
2164 # if STAT_SQUFOF
2165                       q_freq[0]++;
2166                       q_freq[MIN (q, Q_FREQ_SIZE)]++;
2167 # endif
2168                       if (P == P1)
2169                         break;
2170                       t = Q1 + q * (P - P1);
2171                       Q1 = Q;
2172                       Q = t;
2173                       P = P1;
2174                     }
2175
2176                   if ((Q & 1) == 0)
2177                     Q /= 2;
2178                   Q /= gcd_odd (Q, mu);
2179
2180                   affirm (Q > 1 && (n1 || Q < n0));
2181
2182                   if (prime_p (Q))
2183                     factor_insert (factors, Q);
2184                   else if (!factor_using_squfof (0, Q, factors))
2185                     factor_using_pollard_rho (Q, 2, factors);
2186
2187                   divexact_21 (n1, n0, n1, n0, Q);
2188
2189                   if (prime2_p (n1, n0))
2190                     factor_insert_large (factors, n1, n0);
2191                   else
2192                     {
2193                       if (!factor_using_squfof (n1, n0, factors))
2194                         {
2195                           if (n1 == 0)
2196                             factor_using_pollard_rho (n0, 1, factors);
2197                           else
2198                             factor_using_pollard_rho2 (n1, n0, 1, factors);
2199                         }
2200                     }
2201
2202                   return true;
2203                 }
2204             }
2205         next_i:;
2206         }
2207     next_multiplier:;
2208     }
2209   return false;
2210 }
2211 #endif
2212
2213 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2214    results in FACTORS.  */
2215 static void
2216 factor (uintmax_t t1, uintmax_t t0, struct factors *factors)
2217 {
2218   factors->nfactors = 0;
2219   factors->plarge[1] = 0;
2220
2221   if (t1 == 0 && t0 < 2)
2222     return;
2223
2224   t0 = factor_using_division (&t1, t1, t0, factors);
2225
2226   if (t1 == 0 && t0 < 2)
2227     return;
2228
2229   if (prime2_p (t1, t0))
2230     factor_insert_large (factors, t1, t0);
2231   else
2232     {
2233 #if USE_SQUFOF
2234       if (factor_using_squfof (t1, t0, factors))
2235         return;
2236 #endif
2237
2238       if (t1 == 0)
2239         factor_using_pollard_rho (t0, 1, factors);
2240       else
2241         factor_using_pollard_rho2 (t1, t0, 1, factors);
2242     }
2243 }
2244
2245 /* Use Pollard-rho to compute the prime factors of
2246    arbitrary-precision T, and put the results in FACTORS.  */
2247 static void
2248 mp_factor (mpz_t t, struct mp_factors *factors)
2249 {
2250   mp_factor_init (factors);
2251
2252   if (mpz_sgn (t) != 0)
2253     {
2254       mp_factor_using_division (t, factors);
2255
2256       if (mpz_cmp_ui (t, 1) != 0)
2257         {
2258           devmsg ("[is number prime?] ");
2259           if (mp_prime_p (t))
2260             mp_factor_insert (factors, t);
2261           else
2262             mp_factor_using_pollard_rho (t, 1, factors);
2263         }
2264     }
2265 }
2266
2267 static strtol_error
2268 strto2uintmax (uintmax_t *hip, uintmax_t *lop, char const *s)
2269 {
2270   int lo_carry;
2271   uintmax_t hi = 0, lo = 0;
2272
2273   strtol_error err = LONGINT_INVALID;
2274
2275   /* Initial scan for invalid digits.  */
2276   char const *p = s;
2277   for (;;)
2278     {
2279       unsigned char c = *p++;
2280       if (c == 0)
2281         break;
2282
2283       if (UNLIKELY (!ISDIGIT (c)))
2284         {
2285           err = LONGINT_INVALID;
2286           break;
2287         }
2288
2289       err = LONGINT_OK;           /* we've seen at least one valid digit */
2290     }
2291
2292   while (err == LONGINT_OK)
2293     {
2294       unsigned char c = *s++;
2295       if (c == 0)
2296         break;
2297
2298       c -= '0';
2299
2300       if (UNLIKELY (hi > ~(uintmax_t)0 / 10))
2301         {
2302           err = LONGINT_OVERFLOW;
2303           break;
2304         }
2305       hi = 10 * hi;
2306
2307       lo_carry = (lo >> (W_TYPE_SIZE - 3)) + (lo >> (W_TYPE_SIZE - 1));
2308       lo_carry += 10 * lo < 2 * lo;
2309
2310       lo = 10 * lo;
2311       lo += c;
2312
2313       lo_carry += lo < c;
2314       hi += lo_carry;
2315       if (UNLIKELY (hi < lo_carry))
2316         {
2317           err = LONGINT_OVERFLOW;
2318           break;
2319         }
2320     }
2321
2322   *hip = hi;
2323   *lop = lo;
2324
2325   return err;
2326 }
2327
2328 /* Structure and routines for buffering and outputting full lines,
2329    to support parallel operation efficiently.  */
2330 static struct lbuf_
2331 {
2332   char *buf;
2333   char *end;
2334 } lbuf;
2335
2336 /* 512 is chosen to give good performance,
2337    and also is the max guaranteed size that
2338    consumers can read atomically through pipes.
2339    Also it's big enough to cater for max line length
2340    even with 128 bit uintmax_t.  */
2341 #define FACTOR_PIPE_BUF 512
2342
2343 static void
2344 lbuf_alloc (void)
2345 {
2346   if (lbuf.buf)
2347     return;
2348
2349   /* Double to ensure enough space for
2350      previous numbers + next number.  */
2351   lbuf.buf = xmalloc (FACTOR_PIPE_BUF * 2);
2352   lbuf.end = lbuf.buf;
2353 }
2354
2355 /* Write complete LBUF to standard output.  */
2356 static void
2357 lbuf_flush (void)
2358 {
2359   size_t size = lbuf.end - lbuf.buf;
2360   if (full_write (STDOUT_FILENO, lbuf.buf, size) != size)
2361     write_error ();
2362   lbuf.end = lbuf.buf;
2363 }
2364
2365 /* Add a character C to LBUF and if it's a newline
2366    and enough bytes are already buffered,
2367    then write atomically to standard output.  */
2368 static void
2369 lbuf_putc (char c)
2370 {
2371   *lbuf.end++ = c;
2372
2373   if (c == '\n')
2374     {
2375       size_t buffered = lbuf.end - lbuf.buf;
2376
2377       /* Provide immediate output for interactive use.  */
2378       static int line_buffered = -1;
2379       if (line_buffered == -1)
2380         line_buffered = isatty (STDIN_FILENO) || isatty (STDOUT_FILENO);
2381       if (line_buffered)
2382         lbuf_flush ();
2383       else if (buffered >= FACTOR_PIPE_BUF)
2384         {
2385           /* Write output in <= PIPE_BUF chunks
2386              so consumers can read atomically.  */
2387           char const *tend = lbuf.end;
2388
2389           /* Since a umaxint_t's factors must fit in 512
2390              we're guaranteed to find a newline here.  */
2391           char *tlend = lbuf.buf + FACTOR_PIPE_BUF;
2392           while (*--tlend != '\n');
2393           tlend++;
2394
2395           lbuf.end = tlend;
2396           lbuf_flush ();
2397
2398           /* Buffer the remainder.  */
2399           memcpy (lbuf.buf, tlend, tend - tlend);
2400           lbuf.end = lbuf.buf + (tend - tlend);
2401         }
2402     }
2403 }
2404
2405 /* Buffer an int to the internal LBUF.  */
2406 static void
2407 lbuf_putint (uintmax_t i, size_t min_width)
2408 {
2409   char buf[INT_BUFSIZE_BOUND (uintmax_t)];
2410   char const *umaxstr = umaxtostr (i, buf);
2411   size_t width = sizeof (buf) - (umaxstr - buf) - 1;
2412   size_t z = width;
2413
2414   for (; z < min_width; z++)
2415     *lbuf.end++ = '0';
2416
2417   memcpy (lbuf.end, umaxstr, width);
2418   lbuf.end += width;
2419 }
2420
2421 static void
2422 print_uintmaxes (uintmax_t t1, uintmax_t t0)
2423 {
2424   uintmax_t q, r;
2425
2426   if (t1 == 0)
2427     lbuf_putint (t0, 0);
2428   else
2429     {
2430       /* Use very plain code here since it seems hard to write fast code
2431          without assuming a specific word size.  */
2432       q = t1 / 1000000000;
2433       r = t1 % 1000000000;
2434       udiv_qrnnd (t0, r, r, t0, 1000000000);
2435       print_uintmaxes (q, t0);
2436       lbuf_putint (r, 9);
2437     }
2438 }
2439
2440 /* Single-precision factoring */
2441 static void
2442 print_factors_single (uintmax_t t1, uintmax_t t0)
2443 {
2444   struct factors factors;
2445
2446   print_uintmaxes (t1, t0);
2447   lbuf_putc (':');
2448
2449   factor (t1, t0, &factors);
2450
2451   for (int j = 0; j < factors.nfactors; j++)
2452     for (int k = 0; k < factors.e[j]; k++)
2453       {
2454         lbuf_putc (' ');
2455         print_uintmaxes (0, factors.p[j]);
2456         if (print_exponents && factors.e[j] > 1)
2457           {
2458             lbuf_putc ('^');
2459             lbuf_putint (factors.e[j], 0);
2460             break;
2461           }
2462       }
2463
2464   if (factors.plarge[1])
2465     {
2466       lbuf_putc (' ');
2467       print_uintmaxes (factors.plarge[1], factors.plarge[0]);
2468     }
2469
2470   lbuf_putc ('\n');
2471 }
2472
2473 /* Emit the factors of the indicated number.  If we have the option of using
2474    either algorithm, we select on the basis of the length of the number.
2475    For longer numbers, we prefer the MP algorithm even if the native algorithm
2476    has enough digits, because the algorithm is better.  The turnover point
2477    depends on the value.  */
2478 static bool
2479 print_factors (char const *input)
2480 {
2481   /* Skip initial spaces and '+'.  */
2482   char const *str = input;
2483   while (*str == ' ')
2484     str++;
2485   str += *str == '+';
2486
2487   uintmax_t t1, t0;
2488
2489   /* Try converting the number to one or two words.  If it fails, use GMP or
2490      print an error message.  The 2nd condition checks that the most
2491      significant bit of the two-word number is clear, in a typesize neutral
2492      way.  */
2493   strtol_error err = strto2uintmax (&t1, &t0, str);
2494
2495   switch (err)
2496     {
2497     case LONGINT_OK:
2498       if (((t1 << 1) >> 1) == t1)
2499         {
2500           devmsg ("[using single-precision arithmetic] ");
2501           print_factors_single (t1, t0);
2502           return true;
2503         }
2504       break;
2505
2506     case LONGINT_OVERFLOW:
2507       /* Try GMP.  */
2508       break;
2509
2510     default:
2511       error (0, 0, _("%s is not a valid positive integer"), quote (input));
2512       return false;
2513     }
2514
2515   devmsg ("[using arbitrary-precision arithmetic] ");
2516   mpz_t t;
2517   struct mp_factors factors;
2518
2519   mpz_init_set_str (t, str, 10);
2520
2521   mpz_out_str (stdout, 10, t);
2522   putchar (':');
2523   mp_factor (t, &factors);
2524
2525   for (idx_t j = 0; j < factors.nfactors; j++)
2526     for (unsigned long int k = 0; k < factors.e[j]; k++)
2527       {
2528         putchar (' ');
2529         mpz_out_str (stdout, 10, factors.p[j]);
2530         if (print_exponents && factors.e[j] > 1)
2531           {
2532             printf ("^%lu", factors.e[j]);
2533             break;
2534           }
2535       }
2536
2537   mp_factor_clear (&factors);
2538   mpz_clear (t);
2539   putchar ('\n');
2540   fflush (stdout);
2541   return true;
2542 }
2543
2544 void
2545 usage (int status)
2546 {
2547   if (status != EXIT_SUCCESS)
2548     emit_try_help ();
2549   else
2550     {
2551       printf (_("\
2552 Usage: %s [OPTION] [NUMBER]...\n\
2553 "),
2554               program_name);
2555       fputs (_("\
2556 Print the prime factors of each specified integer NUMBER.  If none\n\
2557 are specified on the command line, read them from standard input.\n\
2558 \n\
2559 "), stdout);
2560       fputs ("\
2561   -h, --exponents   print repeated factors in form p^e unless e is 1\n\
2562 ", stdout);
2563       fputs (HELP_OPTION_DESCRIPTION, stdout);
2564       fputs (VERSION_OPTION_DESCRIPTION, stdout);
2565       emit_ancillary_info (PROGRAM_NAME);
2566     }
2567   exit (status);
2568 }
2569
2570 static bool
2571 do_stdin (void)
2572 {
2573   bool ok = true;
2574   token_buffer tokenbuffer;
2575
2576   init_tokenbuffer (&tokenbuffer);
2577
2578   while (true)
2579     {
2580       size_t token_length = readtoken (stdin, DELIM, sizeof (DELIM) - 1,
2581                                        &tokenbuffer);
2582       if (token_length == (size_t) -1)
2583         {
2584           if (ferror (stdin))
2585             error (EXIT_FAILURE, errno, _("error reading input"));
2586           break;
2587         }
2588
2589       ok &= print_factors (tokenbuffer.buffer);
2590     }
2591   free (tokenbuffer.buffer);
2592
2593   return ok;
2594 }
2595
2596 int
2597 main (int argc, char **argv)
2598 {
2599   initialize_main (&argc, &argv);
2600   set_program_name (argv[0]);
2601   setlocale (LC_ALL, "");
2602   bindtextdomain (PACKAGE, LOCALEDIR);
2603   textdomain (PACKAGE);
2604
2605   lbuf_alloc ();
2606   atexit (close_stdout);
2607   atexit (lbuf_flush);
2608
2609   int c;
2610   while ((c = getopt_long (argc, argv, "h", long_options, nullptr)) != -1)
2611     {
2612       switch (c)
2613         {
2614         case 'h':  /* NetBSD used -h for this functionality first.  */
2615           print_exponents = true;
2616           break;
2617
2618         case DEV_DEBUG_OPTION:
2619           dev_debug = true;
2620           break;
2621
2622         case_GETOPT_HELP_CHAR;
2623
2624         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2625
2626         default:
2627           usage (EXIT_FAILURE);
2628         }
2629     }
2630
2631 #if STAT_SQUFOF
2632   memset (q_freq, 0, sizeof (q_freq));
2633 #endif
2634
2635   bool ok;
2636   if (argc <= optind)
2637     ok = do_stdin ();
2638   else
2639     {
2640       ok = true;
2641       for (int i = optind; i < argc; i++)
2642         if (! print_factors (argv[i]))
2643           ok = false;
2644     }
2645
2646 #if STAT_SQUFOF
2647   if (q_freq[0] > 0)
2648     {
2649       double acc_f;
2650       printf ("q  freq.  cum. freq.(total: %d)\n", q_freq[0]);
2651       for (int i = 1, acc_f = 0.0; i <= Q_FREQ_SIZE; i++)
2652         {
2653           double f = (double) q_freq[i] / q_freq[0];
2654           acc_f += f;
2655           printf ("%s%d %.2f%% %.2f%%\n", i == Q_FREQ_SIZE ? ">=" : "", i,
2656                   100.0 * f, 100.0 * acc_f);
2657         }
2658     }
2659 #endif
2660
2661   return ok ? EXIT_SUCCESS : EXIT_FAILURE;
2662 }