1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
40 (1) Perform trial division using a small primes table, but without hardware
41 division since the primes table store inverses modulo the word base.
42 (The GMP variant of this code doesn't make use of the precomputed
43 inverses, but instead relies on GMP for fast divisibility testing.)
44 (2) Check the nature of any non-factored part using Miller-Rabin for
45 detecting composites, and Lucas for detecting primes.
46 (3) Factor any remaining composite part using the Pollard-Brent rho
47 algorithm or if USE_SQUFOF is defined to 1, try that first.
48 Status of found factors are checked again using Miller-Rabin and Lucas.
50 We prefer using Hensel norm in the divisions, not the more familiar
51 Euclidian norm, since the former leads to much faster code. In the
52 Pollard-Brent rho code and the prime testing code, we use Montgomery's
53 trick of multiplying all n-residues by the word base, allowing cheap Hensel
56 The GMP code uses an algorithm that can be considerably slower;
57 for example, on a circa-2017 Intel Xeon Silver 4116, factoring
58 2^{127}-3 takes about 50 ms with the two-word algorithm but would
59 take about 750 ms with the GMP code.
63 * Use modular inverses also for exact division in the Lucas code, and
64 elsewhere. A problem is to locate the inverses not from an index, but
65 from a prime. We might instead compute the inverse on-the-fly.
67 * Tune trial division table size (not forgetting that this is a standalone
68 program where the table will be read from secondary storage for
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
111 #include "full-write.h"
113 #include "readtokens.h"
116 /* The official name of this program (e.g., no 'g' prefix). */
117 #define PROGRAM_NAME "factor"
120 proper_name ("Paul Rubin"), \
121 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
122 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
124 /* Token delimiters when reading from a file. */
125 #define DELIM "\n\t "
127 #ifndef USE_LONGLONG_H
128 /* With the way we use longlong.h, it's only safe to use
129 when UWtype = UHWtype, as there were various cases
130 (as can be seen in the history for longlong.h) where
131 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
132 to avoid compile time or run time issues. */
133 # if LONG_MAX == INTMAX_MAX
134 # define USE_LONGLONG_H 1
140 /* Make definitions for longlong.h to make it do what it can do for us */
142 /* bitcount for uintmax_t */
143 # if UINTMAX_MAX == UINT32_MAX
144 # define W_TYPE_SIZE 32
145 # elif UINTMAX_MAX == UINT64_MAX
146 # define W_TYPE_SIZE 64
147 # elif UINTMAX_MAX == UINT128_MAX
148 # define W_TYPE_SIZE 128
151 # define UWtype uintmax_t
152 # define UHWtype unsigned long int
154 # if HAVE_ATTRIBUTE_MODE
155 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
156 typedef int SItype
__attribute__ ((mode (SI
)));
157 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
158 typedef int DItype
__attribute__ ((mode (DI
)));
159 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
161 typedef unsigned char UQItype
;
163 typedef unsigned long int USItype
;
164 # if HAVE_LONG_LONG_INT
165 typedef long long int DItype
;
166 typedef unsigned long long int UDItype
;
167 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
168 typedef long int DItype
;
169 typedef unsigned long int UDItype
;
172 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
173 # define ASSERT(x) /* FIXME make longlong.h really standalone */
174 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
175 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
176 # ifndef __GMP_GNUC_PREREQ
177 # define __GMP_GNUC_PREREQ(a,b) 1
180 /* These stub macros are only used in longlong.h in certain system compiler
181 combinations, so ensure usage to avoid -Wunused-macros warnings. */
182 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
188 # define HAVE_HOST_CPU_FAMILY_powerpc 1
190 # include "longlong.h"
191 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
192 const unsigned char factor_clz_tab
[129] =
194 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
195 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
196 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
197 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
202 #else /* not USE_LONGLONG_H */
204 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
205 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
206 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
207 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
211 #if !defined __clz_tab && !defined UHWtype
212 /* Without this seemingly useless conditional, gcc -Wunused-macros
213 warns that each of the two tested macros is unused on Fedora 18.
214 FIXME: this is just an ugly band-aid. Fix it properly. */
217 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
218 #define MAX_NFACTS 26
222 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
225 static struct option
const long_options
[] =
227 {"exponents", no_argument
, nullptr, 'h'},
228 {"-debug", no_argument
, nullptr, DEV_DEBUG_OPTION
},
229 {GETOPT_HELP_OPTION_DECL
},
230 {GETOPT_VERSION_OPTION_DECL
},
231 {nullptr, 0, nullptr, 0}
234 /* If true, use p^e output format. */
235 static bool print_exponents
;
239 uintmax_t plarge
[2]; /* Can have a single large factor */
240 uintmax_t p
[MAX_NFACTS
];
241 unsigned char e
[MAX_NFACTS
];
242 unsigned char nfactors
;
248 unsigned long int *e
;
252 static void factor (uintmax_t, uintmax_t, struct factors
*);
255 # define umul_ppmm(w1, w0, u, v) \
257 uintmax_t __x0, __x1, __x2, __x3; \
258 unsigned long int __ul, __vl, __uh, __vh; \
259 uintmax_t __u = (u), __v = (v); \
261 __ul = __ll_lowpart (__u); \
262 __uh = __ll_highpart (__u); \
263 __vl = __ll_lowpart (__v); \
264 __vh = __ll_highpart (__v); \
266 __x0 = (uintmax_t) __ul * __vl; \
267 __x1 = (uintmax_t) __ul * __vh; \
268 __x2 = (uintmax_t) __uh * __vl; \
269 __x3 = (uintmax_t) __uh * __vh; \
271 __x1 += __ll_highpart (__x0);/* This can't give carry. */ \
272 __x1 += __x2; /* But this indeed can. */ \
273 if (__x1 < __x2) /* Did we get it? */ \
274 __x3 += __ll_B; /* Yes, add it in the proper pos. */ \
276 (w1) = __x3 + __ll_highpart (__x1); \
277 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
281 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
282 /* Define our own, not needing normalization. This function is
283 currently not performance critical, so keep it simple. Similar to
284 the mod macro below. */
286 # define udiv_qrnnd(q, r, n1, n0, d) \
288 uintmax_t __d1, __d0, __q, __r1, __r0; \
290 __d1 = (d); __d0 = 0; \
291 __r1 = (n1); __r0 = (n0); \
292 affirm (__r1 < __d1); \
294 for (int __i = W_TYPE_SIZE; __i > 0; __i--) \
296 rsh2 (__d1, __d0, __d1, __d0, 1); \
298 if (ge2 (__r1, __r0, __d1, __d0)) \
301 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
309 #if !defined add_ssaaaa
310 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
313 _add_x = (al) + (bl); \
314 (sh) = (ah) + (bh) + (_add_x < (al)); \
319 #define rsh2(rh, rl, ah, al, cnt) \
321 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
322 (rh) = (ah) >> (cnt); \
325 #define lsh2(rh, rl, ah, al, cnt) \
327 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
328 (rl) = (al) << (cnt); \
331 #define ge2(ah, al, bh, bl) \
332 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
334 #define gt2(ah, al, bh, bl) \
335 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
338 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
342 (rl) = (al) - (bl); \
343 (rh) = (ah) - (bh) - _cy; \
347 #ifndef count_leading_zeros
348 # define count_leading_zeros(count, x) do { \
349 uintmax_t __clz_x = (x); \
352 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
355 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
361 #ifndef count_trailing_zeros
362 # define count_trailing_zeros(count, x) do { \
363 uintmax_t __ctz_x = (x); \
365 while ((__ctz_x & 1) == 0) \
374 /* Requires that a < n and b <= n */
375 #define submod(r,a,b,n) \
377 uintmax_t _t = - (uintmax_t) (a < b); \
378 (r) = ((n) & _t) + (a) - (b); \
381 #define addmod(r,a,b,n) \
382 submod ((r), (a), ((n) - (b)), (n))
384 /* Modular two-word addition and subtraction. For performance reasons, the
385 most significant bit of n1 must be clear. The destination variables must be
386 distinct from the mod operand. */
387 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
389 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
390 if (ge2 ((r1), (r0), (n1), (n0))) \
391 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
393 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
395 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
396 if ((intmax_t) (r1) < 0) \
397 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
400 #define HIGHBIT_TO_MASK(x) \
401 (((intmax_t)-1 >> 1) < 0 \
402 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
403 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
404 ? UINTMAX_MAX : (uintmax_t) 0))
406 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
407 Requires that d1 != 0. */
409 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
421 count_leading_zeros (cntd
, d1
);
422 count_leading_zeros (cnta
, a1
);
423 int cnt
= cntd
- cnta
;
424 lsh2 (d1
, d0
, d1
, d0
, cnt
);
425 for (int i
= 0; i
< cnt
; i
++)
427 if (ge2 (a1
, a0
, d1
, d0
))
428 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
429 rsh2 (d1
, d0
, d1
, d0
, 1);
438 gcd_odd (uintmax_t a
, uintmax_t b
)
449 /* Take out least significant one bit, to make room for sign */
465 bgta
= HIGHBIT_TO_MASK (t
);
467 /* b <-- min (a, b) */
471 a
= (t
^ bgta
) - bgta
;
476 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
486 while ((a0
& 1) == 0)
487 rsh2 (a1
, a0
, a1
, a0
, 1);
494 return gcd_odd (b0
, a0
);
497 if (gt2 (a1
, a0
, b1
, b0
))
499 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
501 rsh2 (a1
, a0
, a1
, a0
, 1);
502 while ((a0
& 1) == 0);
504 else if (gt2 (b1
, b0
, a1
, a0
))
506 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
508 rsh2 (b1
, b0
, b1
, b0
, 1);
509 while ((b0
& 1) == 0);
520 factor_insert_multiplicity (struct factors
*factors
,
521 uintmax_t prime
, int m
)
523 int nfactors
= factors
->nfactors
;
524 uintmax_t *p
= factors
->p
;
525 unsigned char *e
= factors
->e
;
527 /* Locate position for insert new or increment e. */
529 for (i
= nfactors
- 1; i
>= 0; i
--)
535 if (i
< 0 || p
[i
] != prime
)
537 for (int j
= nfactors
- 1; j
> i
; j
--)
544 factors
->nfactors
= nfactors
+ 1;
552 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
555 factor_insert_large (struct factors
*factors
,
556 uintmax_t p1
, uintmax_t p0
)
560 affirm (factors
->plarge
[1] == 0);
561 factors
->plarge
[0] = p0
;
562 factors
->plarge
[1] = p1
;
565 factor_insert (factors
, p0
);
572 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
573 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
576 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
580 va_start (ap
, mpz_single_init
);
583 while ((mpz
= va_arg (ap
, mpz_t
*)))
584 mpz_single_init (*mpz
);
590 static void mp_factor (mpz_t
, struct mp_factors
*);
593 mp_factor_init (struct mp_factors
*factors
)
595 factors
->p
= nullptr;
596 factors
->e
= nullptr;
597 factors
->nfactors
= 0;
601 mp_factor_clear (struct mp_factors
*factors
)
603 for (idx_t i
= 0; i
< factors
->nfactors
; i
++)
604 mpz_clear (factors
->p
[i
]);
611 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
613 idx_t nfactors
= factors
->nfactors
;
614 mpz_t
*p
= factors
->p
;
615 unsigned long int *e
= factors
->e
;
618 /* Locate position for insert new or increment e. */
619 for (i
= nfactors
- 1; i
>= 0; i
--)
621 if (mpz_cmp (p
[i
], prime
) <= 0)
625 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
627 p
= xireallocarray (p
, nfactors
+ 1, sizeof p
[0]);
628 e
= xireallocarray (e
, nfactors
+ 1, sizeof e
[0]);
630 mpz_init (p
[nfactors
]);
631 for (long j
= nfactors
- 1; j
> i
; j
--)
633 mpz_set (p
[j
+ 1], p
[j
]);
636 mpz_set (p
[i
+ 1], prime
);
641 factors
->nfactors
= nfactors
+ 1;
650 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
654 mpz_init_set_ui (pz
, prime
);
655 mp_factor_insert (factors
, pz
);
660 /* Number of bits in an uintmax_t. */
661 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
663 /* Verify that uintmax_t does not have holes in its representation. */
664 static_assert (UINTMAX_MAX
>> (W
- 1) == 1);
666 #define P(a,b,c,d) a,
667 static const unsigned char primes_diff
[] = {
669 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
673 #define PRIMES_PTAB_ENTRIES \
674 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
676 #define P(a,b,c,d) b,
677 static const unsigned char primes_diff8
[] = {
679 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
688 #define P(a,b,c,d) {c,d},
689 static const struct primes_dtab primes_dtab
[] = {
691 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
695 /* Verify that uintmax_t is not wider than
696 the integers used to generate primes.h. */
697 static_assert (W
<= WIDE_UINT_BITS
);
699 /* debugging for developers. Enables devmsg().
700 This flag is used only in the GMP code. */
701 static bool dev_debug
= false;
703 /* Prove primality or run probabilistic tests. */
704 static bool flag_prove_primality
= PROVE_PRIMALITY
;
706 /* Number of Miller-Rabin tests to run when not proving primality. */
710 factor_insert_refind (struct factors
*factors
, uintmax_t p
, int i
, int off
)
712 for (int j
= 0; j
< off
; j
++)
713 p
+= primes_diff
[i
+ j
];
714 factor_insert (factors
, p
);
717 /* Trial division with odd primes uses the following trick.
719 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
720 consider the case t < B (this is the second loop below).
722 From our tables we get
724 binv = p^{-1} (mod B)
725 lim = floor ((B-1) / p).
727 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
728 (and all quotients in this range occur for some t).
730 Then t = q * p is true also (mod B), and p is invertible we get
732 q = t * binv (mod B).
734 Next, assume that t is *not* divisible by p. Since multiplication
735 by binv (mod B) is a one-to-one mapping,
737 t * binv (mod B) > lim,
739 because all the smaller values are already taken.
741 This can be summed up by saying that the function
743 q(t) = binv * t (mod B)
745 is a permutation of the range 0 <= t < B, with the curious property
746 that it maps the multiples of p onto the range 0 <= q <= lim, in
747 order, and the non-multiples of p onto the range lim < q < B.
751 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
752 struct factors
*factors
)
760 count_trailing_zeros (cnt
, t1
);
767 count_trailing_zeros (cnt
, t0
);
768 rsh2 (t1
, t0
, t1
, t0
, cnt
);
771 factor_insert_multiplicity (factors
, 2, cnt
);
776 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
780 uintmax_t q1
, q0
, hi
;
781 MAYBE_UNUSED
uintmax_t lo
;
783 q0
= t0
* primes_dtab
[i
].binv
;
784 umul_ppmm (hi
, lo
, q0
, p
);
788 q1
= hi
* primes_dtab
[i
].binv
;
789 if (LIKELY (q1
> primes_dtab
[i
].lim
))
792 factor_insert (factors
, p
);
794 p
+= primes_diff
[i
+ 1];
799 #define DIVBLOCK(I) \
803 q = t0 * pd[I].binv; \
804 if (LIKELY (q > pd[I].lim)) \
807 factor_insert_refind (factors, p, i + 1, I); \
811 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
814 const struct primes_dtab
*pd
= &primes_dtab
[i
];
824 p
+= primes_diff8
[i
];
833 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
838 devmsg ("[trial division] ");
842 p
= mpz_scan1 (t
, 0);
843 mpz_fdiv_q_2exp (t
, t
, p
);
846 mp_factor_insert_ui (factors
, 2);
850 unsigned long int d
= 3;
851 for (idx_t i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
853 if (! mpz_divisible_ui_p (t
, d
))
855 d
+= primes_diff
[i
++];
856 if (mpz_cmp_ui (t
, d
* d
) < 0)
861 mpz_tdiv_q_ui (t
, t
, d
);
862 mp_factor_insert_ui (factors
, d
);
869 /* Entry i contains (2i+1)^(-1) mod 2^8. */
870 static const unsigned char binvert_table
[128] =
872 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
873 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
874 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
875 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
876 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
877 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
878 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
879 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
880 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
881 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
882 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
883 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
884 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
885 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
886 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
887 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
890 /* Compute n^(-1) mod B, using a Newton iteration. */
891 #define binv(inv,n) \
893 uintmax_t __n = (n); \
896 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
897 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
898 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
899 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
901 if (W_TYPE_SIZE > 64) \
903 int __invbits = 64; \
905 __inv = 2 * __inv - __inv * __inv * __n; \
907 } while (__invbits < W_TYPE_SIZE); \
913 /* q = u / d, assuming d|u. */
914 #define divexact_21(q1, q0, u1, u0, d) \
916 uintmax_t _di, _q0; \
922 MAYBE_UNUSED intmax_t _p0; \
923 umul_ppmm (_p1, _p0, _q0, d); \
924 (q1) = ((u1) - _p1) * _di; \
935 #define redcify(r_prim, r, n) \
937 MAYBE_UNUSED uintmax_t _redcify_q; \
938 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
941 /* x B^2 (mod n). Requires x > 0, n1 < B/2. */
942 #define redcify2(r1, r0, x, n1, n0) \
944 uintmax_t _r1, _r0, _i; \
947 _r1 = (x); _r0 = 0; \
952 _r1 = 0; _r0 = (x); \
953 _i = 2 * W_TYPE_SIZE; \
957 lsh2 (_r1, _r0, _r1, _r0, 1); \
958 if (ge2 (_r1, _r0, (n1), (n0))) \
959 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
965 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
966 Both a and b must be in redc form, the result will be in redc form too. */
967 static inline uintmax_t
968 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
970 uintmax_t rh
, rl
, q
, th
, xh
;
971 MAYBE_UNUSED
uintmax_t tl
;
973 umul_ppmm (rh
, rl
, a
, b
);
975 umul_ppmm (th
, tl
, q
, m
);
983 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
984 Both a and b must be in redc form, the result will be in redc form too.
985 For performance reasons, the most significant bit of m must be clear. */
987 mulredc2 (uintmax_t *r1p
,
988 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
989 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
991 uintmax_t r1
, r0
, q
, p1
, t1
, t0
, s1
, s0
;
992 MAYBE_UNUSED
uintmax_t p0
;
994 affirm ((a1
>> (W_TYPE_SIZE
- 1)) == 0);
995 affirm ((b1
>> (W_TYPE_SIZE
- 1)) == 0);
996 affirm ((m1
>> (W_TYPE_SIZE
- 1)) == 0);
998 /* First compute a0 * <b1, b0> B^{-1}
1011 umul_ppmm (t1
, t0
, a0
, b0
);
1012 umul_ppmm (r1
, r0
, a0
, b1
);
1014 umul_ppmm (p1
, p0
, q
, m0
);
1015 umul_ppmm (s1
, s0
, q
, m1
);
1016 r0
+= (t0
!= 0); /* Carry */
1017 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1018 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1019 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1021 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1036 umul_ppmm (t1
, t0
, a1
, b0
);
1037 umul_ppmm (s1
, s0
, a1
, b1
);
1038 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1040 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1041 umul_ppmm (p1
, p0
, q
, m0
);
1042 umul_ppmm (s1
, s0
, q
, m1
);
1043 r0
+= (t0
!= 0); /* Carry */
1044 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1045 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1046 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1048 if (ge2 (r1
, r0
, m1
, m0
))
1049 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1057 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1066 b
= mulredc (b
, b
, n
, ni
);
1070 y
= mulredc (y
, b
, n
, ni
);
1077 powm2 (uintmax_t *r1m
,
1078 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1079 uintmax_t ni
, const uintmax_t *one
)
1081 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1093 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1097 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1100 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1103 for (e
= ep
[1]; e
> 0; e
>>= 1)
1107 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1110 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1119 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1120 int k
, uintmax_t one
)
1122 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1124 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1126 if (y
== one
|| y
== nm1
)
1129 for (int i
= 1; i
< k
; i
++)
1131 y
= mulredc (y
, y
, n
, ni
);
1141 ATTRIBUTE_PURE
static bool
1142 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1143 const uintmax_t *qp
, int k
, const uintmax_t *one
)
1145 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1147 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1150 if (y0
== one
[0] && y1
== one
[1])
1153 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1155 if (y0
== nm1_0
&& y1
== nm1_1
)
1158 for (int i
= 1; i
< k
; i
++)
1160 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1163 if (y0
== nm1_0
&& y1
== nm1_1
)
1165 if (y0
== one
[0] && y1
== one
[1])
1172 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1173 mpz_srcptr q
, mp_bitcnt_t k
)
1175 mpz_powm (y
, x
, q
, n
);
1177 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1180 for (mp_bitcnt_t i
= 1; i
< k
; i
++)
1182 mpz_powm_ui (y
, y
, 2, n
);
1183 if (mpz_cmp (y
, nm1
) == 0)
1185 if (mpz_cmp_ui (y
, 1) == 0)
1191 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1192 have been observed. The average seem to be about 2. */
1193 static bool ATTRIBUTE_PURE
1194 prime_p (uintmax_t n
)
1198 uintmax_t a_prim
, one
, ni
;
1199 struct factors factors
;
1204 /* We have already casted out small primes. */
1205 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1208 /* Precomputation for Miller-Rabin. */
1209 uintmax_t q
= n
- 1;
1210 for (k
= 0; (q
& 1) == 0; k
++)
1214 binv (ni
, n
); /* ni <- 1/n mod B */
1215 redcify (one
, 1, n
);
1216 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1218 /* Perform a Miller-Rabin test, finds most composites quickly. */
1219 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1222 if (flag_prove_primality
)
1224 /* Factor n-1 for Lucas. */
1225 factor (0, n
- 1, &factors
);
1228 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1229 number composite. */
1230 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1232 if (flag_prove_primality
)
1235 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1238 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1243 /* After enough Miller-Rabin runs, be content. */
1244 is_prime
= (r
== MR_REPS
- 1);
1250 a
+= primes_diff
[r
]; /* Establish new base. */
1252 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1253 on most processors, since it avoids udiv_qrnnd. If we go down the
1254 udiv_qrnnd_preinv path, this code should be replaced. */
1257 umul_ppmm (s1
, s0
, one
, a
);
1258 if (LIKELY (s1
== 0))
1262 MAYBE_UNUSED
uintmax_t dummy
;
1263 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1267 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1271 affirm (!"Lucas prime test failure. This should not happen");
1274 static bool ATTRIBUTE_PURE
1275 prime2_p (uintmax_t n1
, uintmax_t n0
)
1277 uintmax_t q
[2], nm1
[2];
1278 uintmax_t a_prim
[2];
1283 struct factors factors
;
1286 return prime_p (n0
);
1288 nm1
[1] = n1
- (n0
== 0);
1292 count_trailing_zeros (k
, nm1
[1]);
1300 count_trailing_zeros (k
, nm1
[0]);
1301 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1306 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1307 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1309 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1313 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1316 if (flag_prove_primality
)
1318 /* Factor n-1 for Lucas. */
1319 factor (nm1
[1], nm1
[0], &factors
);
1322 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323 number composite. */
1324 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1327 uintmax_t e
[2], y
[2];
1329 if (flag_prove_primality
)
1332 if (factors
.plarge
[1])
1335 binv (pi
, factors
.plarge
[0]);
1338 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1339 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1341 for (int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1343 /* FIXME: We always have the factor 2. Do we really need to
1344 handle it here? We have done the same powering as part
1346 if (factors
.p
[i
] == 2)
1347 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1349 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1350 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1351 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1356 /* After enough Miller-Rabin runs, be content. */
1357 is_prime
= (r
== MR_REPS
- 1);
1363 a
+= primes_diff
[r
]; /* Establish new base. */
1364 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1366 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1370 affirm (!"Lucas prime test failure. This should not happen");
1374 mp_prime_p (mpz_t n
)
1377 mpz_t q
, a
, nm1
, tmp
;
1378 struct mp_factors factors
;
1380 if (mpz_cmp_ui (n
, 1) <= 0)
1383 /* We have already casted out small primes. */
1384 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1387 mpz_inits (q
, a
, nm1
, tmp
, nullptr);
1389 /* Precomputation for Miller-Rabin. */
1390 mpz_sub_ui (nm1
, n
, 1);
1392 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1393 mp_bitcnt_t k
= mpz_scan1 (nm1
, 0);
1394 mpz_tdiv_q_2exp (q
, nm1
, k
);
1398 /* Perform a Miller-Rabin test, finds most composites quickly. */
1399 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1405 if (flag_prove_primality
)
1407 /* Factor n-1 for Lucas. */
1409 mp_factor (tmp
, &factors
);
1412 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1413 number composite. */
1414 for (idx_t r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1416 if (flag_prove_primality
)
1419 for (idx_t i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1421 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1422 mpz_powm (tmp
, a
, tmp
, n
);
1423 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1428 /* After enough Miller-Rabin runs, be content. */
1429 is_prime
= (r
== MR_REPS
- 1);
1435 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1437 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1444 affirm (!"Lucas prime test failure. This should not happen");
1447 if (flag_prove_primality
)
1448 mp_factor_clear (&factors
);
1450 mpz_clears (q
, a
, nm1
, tmp
, nullptr);
1456 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1457 struct factors
*factors
)
1459 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1461 unsigned long int k
= 1;
1462 unsigned long int l
= 1;
1465 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1472 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1478 x
= mulredc (x
, x
, n
, ni
);
1479 addmod (x
, x
, a
, n
);
1481 submod (t
, z
, x
, n
);
1482 P
= mulredc (P
, t
, n
, ni
);
1486 if (gcd_odd (P
, n
) != 1)
1496 for (unsigned long int i
= 0; i
< k
; i
++)
1498 x
= mulredc (x
, x
, n
, ni
);
1499 addmod (x
, x
, a
, n
);
1507 y
= mulredc (y
, y
, n
, ni
);
1508 addmod (y
, y
, a
, n
);
1510 submod (t
, z
, y
, n
);
1517 /* Found n itself as factor. Restart with different params. */
1518 factor_using_pollard_rho (n
, a
+ 1, factors
);
1525 factor_using_pollard_rho (g
, a
+ 1, factors
);
1527 factor_insert (factors
, g
);
1531 factor_insert (factors
, n
);
1542 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1543 struct factors
*factors
)
1545 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1547 unsigned long int k
= 1;
1548 unsigned long int l
= 1;
1550 redcify2 (P1
, P0
, 1, n1
, n0
);
1551 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1555 while (n1
!= 0 || n0
!= 1)
1563 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1565 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1567 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1568 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1573 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1574 if (g1
!= 0 || g0
!= 1)
1584 for (unsigned long int i
= 0; i
< k
; i
++)
1586 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1588 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1596 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1598 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1600 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1601 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1603 while (g1
== 0 && g0
== 1);
1607 /* The found factor is one word, and > 1. */
1608 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1611 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1613 factor_insert (factors
, g0
);
1617 /* The found factor is two words. This is highly unlikely, thus hard
1618 to trigger. Please be careful before you change this code! */
1621 if (n1
== g1
&& n0
== g0
)
1623 /* Found n itself as factor. Restart with different params. */
1624 factor_using_pollard_rho2 (n1
, n0
, a
+ 1, factors
);
1628 /* Compute n = n / g. Since the result will fit one word,
1629 we can compute the quotient modulo B, ignoring the high
1635 if (!prime2_p (g1
, g0
))
1636 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1638 factor_insert_large (factors
, g1
, g0
);
1645 factor_insert (factors
, n0
);
1649 factor_using_pollard_rho (n0
, a
, factors
);
1653 if (prime2_p (n1
, n0
))
1655 factor_insert_large (factors
, n1
, n0
);
1659 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1660 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1661 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1666 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1667 struct mp_factors
*factors
)
1672 devmsg ("[pollard-rho (%lu)] ", a
);
1674 mpz_inits (t
, t2
, nullptr);
1675 mpz_init_set_si (y
, 2);
1676 mpz_init_set_si (x
, 2);
1677 mpz_init_set_si (z
, 2);
1678 mpz_init_set_ui (P
, 1);
1680 unsigned long long int k
= 1;
1681 unsigned long long int l
= 1;
1683 while (mpz_cmp_ui (n
, 1) != 0)
1691 mpz_add_ui (x
, x
, a
);
1700 if (mpz_cmp_ui (t
, 1) != 0)
1710 for (unsigned long long int i
= 0; i
< k
; i
++)
1714 mpz_add_ui (x
, x
, a
);
1724 mpz_add_ui (y
, y
, a
);
1729 while (mpz_cmp_ui (t
, 1) == 0);
1731 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1733 if (!mp_prime_p (t
))
1735 devmsg ("[composite factor--restarting pollard-rho] ");
1736 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1740 mp_factor_insert (factors
, t
);
1745 mp_factor_insert (factors
, n
);
1754 mpz_clears (P
, t2
, t
, z
, x
, y
, nullptr);
1758 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1759 algorithm is replaced, consider also returning the remainder. */
1769 count_leading_zeros (c
, n
);
1771 /* Make x > sqrt(n). This will be invariant through the loop. */
1772 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) >> 1);
1776 uintmax_t y
= (x
+ n
/ x
) / 2;
1786 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1791 /* Ensures the remainder fits in an uintmax_t. */
1792 affirm (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1797 count_leading_zeros (shift
, nh
);
1800 /* Make x > sqrt (n). */
1801 x
= isqrt ((nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1802 x
<<= (W_TYPE_SIZE
- shift
) >> 1;
1804 /* Do we need more than one iteration? */
1807 MAYBE_UNUSED
uintmax_t r
;
1809 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1815 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1816 affirm (gt2 (hi
, lo
, nh
, nl
));
1818 umul_ppmm (hi
, lo
, x
, x
);
1819 affirm (ge2 (nh
, nl
, hi
, lo
));
1820 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1830 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1831 # define MAGIC64 0x0202021202030213ULL
1832 # define MAGIC63 0x0402483012450293ULL
1833 # define MAGIC65 0x218a019866014613ULL
1834 # define MAGIC11 0x23b
1836 /* Return the square root if the input is a square, otherwise 0. */
1839 is_square (uintmax_t x
)
1841 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1842 computing the square root. */
1843 if (((MAGIC64
>> (x
& 63)) & 1)
1844 && ((MAGIC63
>> (x
% 63)) & 1)
1845 /* Both 0 and 64 are squares mod (65). */
1846 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1847 && ((MAGIC11
>> (x
% 11) & 1)))
1849 uintmax_t r
= isqrt (x
);
1856 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1857 static short const invtab
[0x81] =
1860 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1861 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1862 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1863 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1864 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1865 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1866 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1867 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1868 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1869 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1870 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1871 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1872 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1873 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1874 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1875 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1878 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1879 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1880 # define div_smallq(q, r, u, d) \
1882 if ((u) / 0x40 < (d)) \
1885 uintmax_t _dinv, _mask, _q, _r; \
1886 count_leading_zeros (_cnt, (d)); \
1888 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1890 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1891 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1895 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1896 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1900 _mask = -(uintmax_t) (_r >= (d)); \
1901 (r) = _r - (_mask & (d)); \
1903 affirm ((q) * (d) + (r) == u); \
1907 uintmax_t _q = (u) / (d); \
1908 (r) = (u) - _q * (d); \
1913 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1914 = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1917 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1918 representing G_0 = (-55, 2 * 4652, 8653).
1920 In the notation of the paper:
1922 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1926 t_0 = floor([q_0 + R_0] / S0) = 1
1927 R_1 = t_0 * S_0 - R_0 = 4001
1928 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1931 /* Multipliers, in order of efficiency:
1932 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1933 0.7317 3*5*7 = 105 = 1
1934 0.7820 3*5*11 = 165 = 1
1936 0.8101 3*7*11 = 231 = 3
1938 0.8284 5*7*11 = 385 = 1
1940 0.8716 3*11 = 33 = 1
1942 0.8913 5*11 = 55 = 3
1944 0.9233 7*11 = 77 = 1
1948 # define QUEUE_SIZE 50
1952 # define Q_FREQ_SIZE 50
1953 /* Element 0 keeps the total */
1954 static int q_freq
[Q_FREQ_SIZE
+ 1];
1958 /* Return true on success. Expected to fail only for numbers
1959 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1961 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1963 /* Uses algorithm and notation from
1965 SQUARE FORM FACTORIZATION
1966 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1968 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1971 static short const multipliers_1
[] =
1973 105, 165, 21, 385, 33, 5, 77, 1, 0
1975 static short const multipliers_3
[] =
1977 1155, 15, 231, 35, 3, 55, 7, 11, 0
1980 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1982 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1985 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1987 if (n0
== sqrt_n
* sqrt_n
)
1991 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
1996 if (prime_p (sqrt_n
))
1997 factor_insert_multiplicity (factors
, sqrt_n
, 2);
2003 if (!factor_using_squfof (0, sqrt_n
, &f
))
2005 /* Try pollard rho instead */
2006 factor_using_pollard_rho (sqrt_n
, 1, &f
);
2008 /* Duplicate the new factors */
2009 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
2010 factor_insert_multiplicity (factors
, f
.p
[i
], 2 * f
.e
[i
]);
2016 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2017 for (short const *m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2020 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2022 unsigned int mu
= *m
;
2025 affirm (mu
* n0
% 4 == 3);
2027 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2028 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2029 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2032 However, this seems insufficient: With n = 37243139 and mu =
2033 105, we get a trivial factor, from the square 38809 = 197^2,
2034 without any corresponding Q earlier in the iteration.
2036 Requiring 64 mu^3 < n seems sufficient. */
2039 if ((uintmax_t) mu
* mu
* mu
>= n0
/ 64)
2044 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2047 umul_ppmm (Dh
, Dl
, n0
, mu
);
2050 affirm (Dl
% 4 != 1);
2051 affirm (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2053 S
= isqrt2 (Dh
, Dl
);
2058 /* Square root remainder fits in one word, so ignore high part. */
2060 /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))? */
2065 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2068 for (i
= 0; i
<= B
; i
++)
2070 uintmax_t q
, P1
, t
, rem
;
2072 div_smallq (q
, rem
, S
+ P
, Q
);
2073 P1
= S
- rem
; /* P1 = q*Q - P */
2075 affirm (q
> 0 && Q
> 0);
2079 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2089 g
/= gcd_odd (g
, mu
);
2093 if (qpos
>= QUEUE_SIZE
)
2094 error (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2096 queue
[qpos
].P
= P
% g
;
2101 /* I think the difference can be either sign, but mod
2102 2^W_TYPE_SIZE arithmetic should be fine. */
2103 t
= Q1
+ q
* (P
- P1
);
2110 uintmax_t r
= is_square (Q
);
2113 for (int j
= 0; j
< qpos
; j
++)
2115 if (queue
[j
].Q
== r
)
2118 /* Traversed entire cycle. */
2119 goto next_multiplier
;
2121 /* Need the absolute value for divisibility test. */
2122 if (P
>= queue
[j
].P
)
2128 /* Delete entries up to and including entry
2129 j, which matched. */
2130 memmove (queue
, queue
+ j
+ 1,
2131 (qpos
- j
- 1) * sizeof (queue
[0]));
2138 /* We have found a square form, which should give a
2141 affirm (S
>= P
); /* What signs are possible? */
2142 P
+= r
* ((S
- P
) / r
);
2144 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2145 for the case D = 2N. */
2146 /* Compute Q = (D - P*P) / Q1, but we need double
2149 umul_ppmm (hi
, lo
, P
, P
);
2150 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2151 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2156 /* Note: There appears to by a typo in the paper,
2157 Step 4a in the algorithm description says q <--
2158 floor([S+P]/\hat Q), but looking at the equations
2159 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2160 (In this code, \hat Q is Q1). */
2161 div_smallq (q
, rem
, S
+ P
, Q
);
2162 P1
= S
- rem
; /* P1 = q*Q - P */
2166 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2170 t
= Q1
+ q
* (P
- P1
);
2178 Q
/= gcd_odd (Q
, mu
);
2180 affirm (Q
> 1 && (n1
|| Q
< n0
));
2183 factor_insert (factors
, Q
);
2184 else if (!factor_using_squfof (0, Q
, factors
))
2185 factor_using_pollard_rho (Q
, 2, factors
);
2187 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2189 if (prime2_p (n1
, n0
))
2190 factor_insert_large (factors
, n1
, n0
);
2193 if (!factor_using_squfof (n1
, n0
, factors
))
2196 factor_using_pollard_rho (n0
, 1, factors
);
2198 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2213 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2214 results in FACTORS. */
2216 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2218 factors
->nfactors
= 0;
2219 factors
->plarge
[1] = 0;
2221 if (t1
== 0 && t0
< 2)
2224 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2226 if (t1
== 0 && t0
< 2)
2229 if (prime2_p (t1
, t0
))
2230 factor_insert_large (factors
, t1
, t0
);
2234 if (factor_using_squfof (t1
, t0
, factors
))
2239 factor_using_pollard_rho (t0
, 1, factors
);
2241 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2245 /* Use Pollard-rho to compute the prime factors of
2246 arbitrary-precision T, and put the results in FACTORS. */
2248 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2250 mp_factor_init (factors
);
2252 if (mpz_sgn (t
) != 0)
2254 mp_factor_using_division (t
, factors
);
2256 if (mpz_cmp_ui (t
, 1) != 0)
2258 devmsg ("[is number prime?] ");
2260 mp_factor_insert (factors
, t
);
2262 mp_factor_using_pollard_rho (t
, 1, factors
);
2268 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, char const *s
)
2271 uintmax_t hi
= 0, lo
= 0;
2273 strtol_error err
= LONGINT_INVALID
;
2275 /* Initial scan for invalid digits. */
2279 unsigned char c
= *p
++;
2283 if (UNLIKELY (!ISDIGIT (c
)))
2285 err
= LONGINT_INVALID
;
2289 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2292 while (err
== LONGINT_OK
)
2294 unsigned char c
= *s
++;
2300 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2302 err
= LONGINT_OVERFLOW
;
2307 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2308 lo_carry
+= 10 * lo
< 2 * lo
;
2315 if (UNLIKELY (hi
< lo_carry
))
2317 err
= LONGINT_OVERFLOW
;
2328 /* Structure and routines for buffering and outputting full lines,
2329 to support parallel operation efficiently. */
2336 /* 512 is chosen to give good performance,
2337 and also is the max guaranteed size that
2338 consumers can read atomically through pipes.
2339 Also it's big enough to cater for max line length
2340 even with 128 bit uintmax_t. */
2341 #define FACTOR_PIPE_BUF 512
2349 /* Double to ensure enough space for
2350 previous numbers + next number. */
2351 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2352 lbuf
.end
= lbuf
.buf
;
2355 /* Write complete LBUF to standard output. */
2359 size_t size
= lbuf
.end
- lbuf
.buf
;
2360 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2362 lbuf
.end
= lbuf
.buf
;
2365 /* Add a character C to LBUF and if it's a newline
2366 and enough bytes are already buffered,
2367 then write atomically to standard output. */
2375 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2377 /* Provide immediate output for interactive use. */
2378 static int line_buffered
= -1;
2379 if (line_buffered
== -1)
2380 line_buffered
= isatty (STDIN_FILENO
) || isatty (STDOUT_FILENO
);
2383 else if (buffered
>= FACTOR_PIPE_BUF
)
2385 /* Write output in <= PIPE_BUF chunks
2386 so consumers can read atomically. */
2387 char const *tend
= lbuf
.end
;
2389 /* Since a umaxint_t's factors must fit in 512
2390 we're guaranteed to find a newline here. */
2391 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2392 while (*--tlend
!= '\n');
2398 /* Buffer the remainder. */
2399 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2400 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2405 /* Buffer an int to the internal LBUF. */
2407 lbuf_putint (uintmax_t i
, size_t min_width
)
2409 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2410 char const *umaxstr
= umaxtostr (i
, buf
);
2411 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2414 for (; z
< min_width
; z
++)
2417 memcpy (lbuf
.end
, umaxstr
, width
);
2422 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2427 lbuf_putint (t0
, 0);
2430 /* Use very plain code here since it seems hard to write fast code
2431 without assuming a specific word size. */
2432 q
= t1
/ 1000000000;
2433 r
= t1
% 1000000000;
2434 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2435 print_uintmaxes (q
, t0
);
2440 /* Single-precision factoring */
2442 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2444 struct factors factors
;
2446 print_uintmaxes (t1
, t0
);
2449 factor (t1
, t0
, &factors
);
2451 for (int j
= 0; j
< factors
.nfactors
; j
++)
2452 for (int k
= 0; k
< factors
.e
[j
]; k
++)
2455 print_uintmaxes (0, factors
.p
[j
]);
2456 if (print_exponents
&& factors
.e
[j
] > 1)
2459 lbuf_putint (factors
.e
[j
], 0);
2464 if (factors
.plarge
[1])
2467 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2473 /* Emit the factors of the indicated number. If we have the option of using
2474 either algorithm, we select on the basis of the length of the number.
2475 For longer numbers, we prefer the MP algorithm even if the native algorithm
2476 has enough digits, because the algorithm is better. The turnover point
2477 depends on the value. */
2479 print_factors (char const *input
)
2481 /* Skip initial spaces and '+'. */
2482 char const *str
= input
;
2489 /* Try converting the number to one or two words. If it fails, use GMP or
2490 print an error message. The 2nd condition checks that the most
2491 significant bit of the two-word number is clear, in a typesize neutral
2493 strtol_error err
= strto2uintmax (&t1
, &t0
, str
);
2498 if (((t1
<< 1) >> 1) == t1
)
2500 devmsg ("[using single-precision arithmetic] ");
2501 print_factors_single (t1
, t0
);
2506 case LONGINT_OVERFLOW
:
2511 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2515 devmsg ("[using arbitrary-precision arithmetic] ");
2517 struct mp_factors factors
;
2519 mpz_init_set_str (t
, str
, 10);
2521 mpz_out_str (stdout
, 10, t
);
2523 mp_factor (t
, &factors
);
2525 for (idx_t j
= 0; j
< factors
.nfactors
; j
++)
2526 for (unsigned long int k
= 0; k
< factors
.e
[j
]; k
++)
2529 mpz_out_str (stdout
, 10, factors
.p
[j
]);
2530 if (print_exponents
&& factors
.e
[j
] > 1)
2532 printf ("^%lu", factors
.e
[j
]);
2537 mp_factor_clear (&factors
);
2547 if (status
!= EXIT_SUCCESS
)
2552 Usage: %s [OPTION] [NUMBER]...\n\
2556 Print the prime factors of each specified integer NUMBER. If none\n\
2557 are specified on the command line, read them from standard input.\n\
2561 -h, --exponents print repeated factors in form p^e unless e is 1\n\
2563 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2564 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2565 emit_ancillary_info (PROGRAM_NAME
);
2574 token_buffer tokenbuffer
;
2576 init_tokenbuffer (&tokenbuffer
);
2580 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2582 if (token_length
== (size_t) -1)
2585 error (EXIT_FAILURE
, errno
, _("error reading input"));
2589 ok
&= print_factors (tokenbuffer
.buffer
);
2591 free (tokenbuffer
.buffer
);
2597 main (int argc
, char **argv
)
2599 initialize_main (&argc
, &argv
);
2600 set_program_name (argv
[0]);
2601 setlocale (LC_ALL
, "");
2602 bindtextdomain (PACKAGE
, LOCALEDIR
);
2603 textdomain (PACKAGE
);
2606 atexit (close_stdout
);
2607 atexit (lbuf_flush
);
2610 while ((c
= getopt_long (argc
, argv
, "h", long_options
, nullptr)) != -1)
2614 case 'h': /* NetBSD used -h for this functionality first. */
2615 print_exponents
= true;
2618 case DEV_DEBUG_OPTION
:
2622 case_GETOPT_HELP_CHAR
;
2624 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2627 usage (EXIT_FAILURE
);
2632 memset (q_freq
, 0, sizeof (q_freq
));
2641 for (int i
= optind
; i
< argc
; i
++)
2642 if (! print_factors (argv
[i
]))
2650 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2651 for (int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2653 double f
= (double) q_freq
[i
] / q_freq
[0];
2655 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2656 100.0 * f
, 100.0 * acc_f
);
2661 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;