1 /* factor -- print prime factors of n.
2 Copyright (C) 1986-2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Originally written by Paul Rubin <phr@ocf.berkeley.edu>.
18 Adapted for GNU, fixed to factor UINT_MAX by Jim Meyering.
19 Arbitrary-precision code adapted by James Youngman from Torbjörn
20 Granlund's factorize.c, from GNU MP version 4.2.2.
21 In 2012, the core was rewritten by Torbjörn Granlund and Niels Möller.
22 Contains code from GNU MP. */
24 /* Efficiently factor numbers that fit in one or two words (word = uintmax_t),
25 or, with GMP, numbers of any size.
29 There are several variants of many functions, for handling one word, two
30 words, and GMP's mpz_t type. If the one-word variant is called foo, the
31 two-word variant will be foo2, and the one for mpz_t will be mp_foo. In
32 some cases, the plain function variants will handle both one-word and
33 two-word numbers, evidenced by function arguments.
35 The factoring code for two words will fall into the code for one word when
40 (1) Perform trial division using a small primes table, but without hardware
41 division since the primes table store inverses modulo the word base.
42 (The GMP variant of this code doesn't make use of the precomputed
43 inverses, but instead relies on GMP for fast divisibility testing.)
44 (2) Check the nature of any non-factored part using Miller-Rabin for
45 detecting composites, and Lucas for detecting primes.
46 (3) Factor any remaining composite part using the Pollard-Brent rho
47 algorithm or if USE_SQUFOF is defined to 1, try that first.
48 Status of found factors are checked again using Miller-Rabin and Lucas.
50 We prefer using Hensel norm in the divisions, not the more familiar
51 Euclidian norm, since the former leads to much faster code. In the
52 Pollard-Brent rho code and the prime testing code, we use Montgomery's
53 trick of multiplying all n-residues by the word base, allowing cheap Hensel
56 The GMP code uses an algorithm that can be considerably slower;
57 for example, on a circa-2017 Intel Xeon Silver 4116, factoring
58 2^{127}-3 takes about 50 ms with the two-word algorithm but would
59 take about 750 ms with the GMP code.
63 * Use modular inverses also for exact division in the Lucas code, and
64 elsewhere. A problem is to locate the inverses not from an index, but
65 from a prime. We might instead compute the inverse on-the-fly.
67 * Tune trial division table size (not forgetting that this is a standalone
68 program where the table will be read from secondary storage for
71 * Implement less naive powm, using k-ary exponentiation for k = 3 or
74 * Try to speed trial division code for single uintmax_t numbers, i.e., the
75 code using DIVBLOCK. It currently runs at 2 cycles per prime (Intel SBR,
76 IBR), 3 cycles per prime (AMD Stars) and 5 cycles per prime (AMD BD) when
77 using gcc 4.6 and 4.7. Some software pipelining should help; 1, 2, and 4
78 respectively cycles ought to be possible.
80 * The redcify function could be vastly improved by using (plain Euclidian)
81 pre-inversion (such as GMP's invert_limb) and udiv_qrnnd_preinv (from
82 GMP's gmp-impl.h). The redcify2 function could be vastly improved using
83 similar methoods. These functions currently dominate run time when using
87 /* Whether to recursively factor to prove primality,
88 or run faster probabilistic tests. */
89 #ifndef PROVE_PRIMALITY
90 # define PROVE_PRIMALITY 1
93 /* Faster for certain ranges but less general. */
98 /* Output SQUFOF statistics. */
100 # define STAT_SQUFOF 0
113 #include "full-write.h"
115 #include "readtokens.h"
118 /* The official name of this program (e.g., no 'g' prefix). */
119 #define PROGRAM_NAME "factor"
122 proper_name ("Paul Rubin"), \
123 proper_name_utf8 ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
124 proper_name_utf8 ("Niels Moller", "Niels M\303\266ller")
126 /* Token delimiters when reading from a file. */
127 #define DELIM "\n\t "
129 #ifndef USE_LONGLONG_H
130 /* With the way we use longlong.h, it's only safe to use
131 when UWtype = UHWtype, as there were various cases
132 (as can be seen in the history for longlong.h) where
133 for example, _LP64 was required to enable W_TYPE_SIZE==64 code,
134 to avoid compile time or run time issues. */
135 # if LONG_MAX == INTMAX_MAX
136 # define USE_LONGLONG_H 1
142 /* Make definitions for longlong.h to make it do what it can do for us */
144 /* bitcount for uintmax_t */
145 # if UINTMAX_MAX == UINT32_MAX
146 # define W_TYPE_SIZE 32
147 # elif UINTMAX_MAX == UINT64_MAX
148 # define W_TYPE_SIZE 64
149 # elif UINTMAX_MAX == UINT128_MAX
150 # define W_TYPE_SIZE 128
153 # define UWtype uintmax_t
154 # define UHWtype unsigned long int
156 # if HAVE_ATTRIBUTE_MODE
157 typedef unsigned int UQItype
__attribute__ ((mode (QI
)));
158 typedef int SItype
__attribute__ ((mode (SI
)));
159 typedef unsigned int USItype
__attribute__ ((mode (SI
)));
160 typedef int DItype
__attribute__ ((mode (DI
)));
161 typedef unsigned int UDItype
__attribute__ ((mode (DI
)));
163 typedef unsigned char UQItype
;
165 typedef unsigned long int USItype
;
166 # if HAVE_LONG_LONG_INT
167 typedef long long int DItype
;
168 typedef unsigned long long int UDItype
;
169 # else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
170 typedef long int DItype
;
171 typedef unsigned long int UDItype
;
174 # define LONGLONG_STANDALONE /* Don't require GMP's longlong.h mdep files */
175 # define ASSERT(x) /* FIXME make longlong.h really standalone */
176 # define __GMP_DECLSPEC /* FIXME make longlong.h really standalone */
177 # define __clz_tab factor_clz_tab /* Rename to avoid glibc collision */
178 # ifndef __GMP_GNUC_PREREQ
179 # define __GMP_GNUC_PREREQ(a,b) 1
182 /* These stub macros are only used in longlong.h in certain system compiler
183 combinations, so ensure usage to avoid -Wunused-macros warnings. */
184 # if __GMP_GNUC_PREREQ (1,1) && defined __clz_tab
190 # define HAVE_HOST_CPU_FAMILY_powerpc 1
192 # include "longlong.h"
193 # ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
194 const unsigned char factor_clz_tab
[129] =
196 1,2,3,3,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
197 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
198 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
199 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
204 #else /* not USE_LONGLONG_H */
206 # define W_TYPE_SIZE (8 * sizeof (uintmax_t))
207 # define __ll_B ((uintmax_t) 1 << (W_TYPE_SIZE / 2))
208 # define __ll_lowpart(t) ((uintmax_t) (t) & (__ll_B - 1))
209 # define __ll_highpart(t) ((uintmax_t) (t) >> (W_TYPE_SIZE / 2))
213 #if !defined __clz_tab && !defined UHWtype
214 /* Without this seemingly useless conditional, gcc -Wunused-macros
215 warns that each of the two tested macros is unused on Fedora 18.
216 FIXME: this is just an ugly band-aid. Fix it properly. */
219 /* 2*3*5*7*11...*101 is 128 bits, and has 26 prime factors */
220 #define MAX_NFACTS 26
224 DEV_DEBUG_OPTION
= CHAR_MAX
+ 1
227 static struct option
const long_options
[] =
229 {"-debug", no_argument
, NULL
, DEV_DEBUG_OPTION
},
230 {GETOPT_HELP_OPTION_DECL
},
231 {GETOPT_VERSION_OPTION_DECL
},
237 uintmax_t plarge
[2]; /* Can have a single large factor */
238 uintmax_t p
[MAX_NFACTS
];
239 unsigned char e
[MAX_NFACTS
];
240 unsigned char nfactors
;
246 unsigned long int *e
;
247 unsigned long int nfactors
;
250 static void factor (uintmax_t, uintmax_t, struct factors
*);
253 # define umul_ppmm(w1, w0, u, v) \
255 uintmax_t __x0, __x1, __x2, __x3; \
256 unsigned long int __ul, __vl, __uh, __vh; \
257 uintmax_t __u = (u), __v = (v); \
259 __ul = __ll_lowpart (__u); \
260 __uh = __ll_highpart (__u); \
261 __vl = __ll_lowpart (__v); \
262 __vh = __ll_highpart (__v); \
264 __x0 = (uintmax_t) __ul * __vl; \
265 __x1 = (uintmax_t) __ul * __vh; \
266 __x2 = (uintmax_t) __uh * __vl; \
267 __x3 = (uintmax_t) __uh * __vh; \
269 __x1 += __ll_highpart (__x0);/* This can't give carry. */ \
270 __x1 += __x2; /* But this indeed can. */ \
271 if (__x1 < __x2) /* Did we get it? */ \
272 __x3 += __ll_B; /* Yes, add it in the proper pos. */ \
274 (w1) = __x3 + __ll_highpart (__x1); \
275 (w0) = (__x1 << W_TYPE_SIZE / 2) + __ll_lowpart (__x0); \
279 #if !defined udiv_qrnnd || defined UDIV_NEEDS_NORMALIZATION
280 /* Define our own, not needing normalization. This function is
281 currently not performance critical, so keep it simple. Similar to
282 the mod macro below. */
284 # define udiv_qrnnd(q, r, n1, n0, d) \
286 uintmax_t __d1, __d0, __q, __r1, __r0; \
288 assert ((n1) < (d)); \
289 __d1 = (d); __d0 = 0; \
290 __r1 = (n1); __r0 = (n0); \
292 for (unsigned int __i = W_TYPE_SIZE; __i > 0; __i--) \
294 rsh2 (__d1, __d0, __d1, __d0, 1); \
296 if (ge2 (__r1, __r0, __d1, __d0)) \
299 sub_ddmmss (__r1, __r0, __r1, __r0, __d1, __d0); \
307 #if !defined add_ssaaaa
308 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \
311 _add_x = (al) + (bl); \
312 (sh) = (ah) + (bh) + (_add_x < (al)); \
317 #define rsh2(rh, rl, ah, al, cnt) \
319 (rl) = ((ah) << (W_TYPE_SIZE - (cnt))) | ((al) >> (cnt)); \
320 (rh) = (ah) >> (cnt); \
323 #define lsh2(rh, rl, ah, al, cnt) \
325 (rh) = ((ah) << cnt) | ((al) >> (W_TYPE_SIZE - (cnt))); \
326 (rl) = (al) << (cnt); \
329 #define ge2(ah, al, bh, bl) \
330 ((ah) > (bh) || ((ah) == (bh) && (al) >= (bl)))
332 #define gt2(ah, al, bh, bl) \
333 ((ah) > (bh) || ((ah) == (bh) && (al) > (bl)))
336 # define sub_ddmmss(rh, rl, ah, al, bh, bl) \
340 (rl) = (al) - (bl); \
341 (rh) = (ah) - (bh) - _cy; \
345 #ifndef count_leading_zeros
346 # define count_leading_zeros(count, x) do { \
347 uintmax_t __clz_x = (x); \
348 unsigned int __clz_c; \
350 (__clz_x & ((uintmax_t) 0xff << (W_TYPE_SIZE - 8))) == 0; \
353 for (; (intmax_t)__clz_x >= 0; __clz_c++) \
359 #ifndef count_trailing_zeros
360 # define count_trailing_zeros(count, x) do { \
361 uintmax_t __ctz_x = (x); \
362 unsigned int __ctz_c = 0; \
363 while ((__ctz_x & 1) == 0) \
372 /* Requires that a < n and b <= n */
373 #define submod(r,a,b,n) \
375 uintmax_t _t = - (uintmax_t) (a < b); \
376 (r) = ((n) & _t) + (a) - (b); \
379 #define addmod(r,a,b,n) \
380 submod ((r), (a), ((n) - (b)), (n))
382 /* Modular two-word addition and subtraction. For performance reasons, the
383 most significant bit of n1 must be clear. The destination variables must be
384 distinct from the mod operand. */
385 #define addmod2(r1, r0, a1, a0, b1, b0, n1, n0) \
387 add_ssaaaa ((r1), (r0), (a1), (a0), (b1), (b0)); \
388 if (ge2 ((r1), (r0), (n1), (n0))) \
389 sub_ddmmss ((r1), (r0), (r1), (r0), (n1), (n0)); \
391 #define submod2(r1, r0, a1, a0, b1, b0, n1, n0) \
393 sub_ddmmss ((r1), (r0), (a1), (a0), (b1), (b0)); \
394 if ((intmax_t) (r1) < 0) \
395 add_ssaaaa ((r1), (r0), (r1), (r0), (n1), (n0)); \
398 #define HIGHBIT_TO_MASK(x) \
399 (((intmax_t)-1 >> 1) < 0 \
400 ? (uintmax_t)((intmax_t)(x) >> (W_TYPE_SIZE - 1)) \
401 : ((x) & ((uintmax_t) 1 << (W_TYPE_SIZE - 1)) \
402 ? UINTMAX_MAX : (uintmax_t) 0))
404 /* Compute r = a mod d, where r = <*t1,retval>, a = <a1,a0>, d = <d1,d0>.
405 Requires that d1 != 0. */
407 mod2 (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t d1
, uintmax_t d0
)
419 count_leading_zeros (cntd
, d1
);
420 count_leading_zeros (cnta
, a1
);
421 int cnt
= cntd
- cnta
;
422 lsh2 (d1
, d0
, d1
, d0
, cnt
);
423 for (int i
= 0; i
< cnt
; i
++)
425 if (ge2 (a1
, a0
, d1
, d0
))
426 sub_ddmmss (a1
, a0
, a1
, a0
, d1
, d0
);
427 rsh2 (d1
, d0
, d1
, d0
, 1);
436 gcd_odd (uintmax_t a
, uintmax_t b
)
447 /* Take out least significant one bit, to make room for sign */
463 bgta
= HIGHBIT_TO_MASK (t
);
465 /* b <-- min (a, b) */
469 a
= (t
^ bgta
) - bgta
;
474 gcd2_odd (uintmax_t *r1
, uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
)
484 while ((a0
& 1) == 0)
485 rsh2 (a1
, a0
, a1
, a0
, 1);
492 return gcd_odd (b0
, a0
);
495 if (gt2 (a1
, a0
, b1
, b0
))
497 sub_ddmmss (a1
, a0
, a1
, a0
, b1
, b0
);
499 rsh2 (a1
, a0
, a1
, a0
, 1);
500 while ((a0
& 1) == 0);
502 else if (gt2 (b1
, b0
, a1
, a0
))
504 sub_ddmmss (b1
, b0
, b1
, b0
, a1
, a0
);
506 rsh2 (b1
, b0
, b1
, b0
, 1);
507 while ((b0
& 1) == 0);
518 factor_insert_multiplicity (struct factors
*factors
,
519 uintmax_t prime
, unsigned int m
)
521 unsigned int nfactors
= factors
->nfactors
;
522 uintmax_t *p
= factors
->p
;
523 unsigned char *e
= factors
->e
;
525 /* Locate position for insert new or increment e. */
527 for (i
= nfactors
- 1; i
>= 0; i
--)
533 if (i
< 0 || p
[i
] != prime
)
535 for (int j
= nfactors
- 1; j
> i
; j
--)
542 factors
->nfactors
= nfactors
+ 1;
550 #define factor_insert(f, p) factor_insert_multiplicity (f, p, 1)
553 factor_insert_large (struct factors
*factors
,
554 uintmax_t p1
, uintmax_t p0
)
558 assert (factors
->plarge
[1] == 0);
559 factors
->plarge
[0] = p0
;
560 factors
->plarge
[1] = p1
;
563 factor_insert (factors
, p0
);
570 # define mpz_inits(...) mpz_va_init (mpz_init, __VA_ARGS__)
571 # define mpz_clears(...) mpz_va_init (mpz_clear, __VA_ARGS__)
574 mpz_va_init (void (*mpz_single_init
)(mpz_t
), ...)
578 va_start (ap
, mpz_single_init
);
581 while ((mpz
= va_arg (ap
, mpz_t
*)))
582 mpz_single_init (*mpz
);
588 static void mp_factor (mpz_t
, struct mp_factors
*);
591 mp_factor_init (struct mp_factors
*factors
)
595 factors
->nfactors
= 0;
599 mp_factor_clear (struct mp_factors
*factors
)
601 for (unsigned int i
= 0; i
< factors
->nfactors
; i
++)
602 mpz_clear (factors
->p
[i
]);
609 mp_factor_insert (struct mp_factors
*factors
, mpz_t prime
)
611 unsigned long int nfactors
= factors
->nfactors
;
612 mpz_t
*p
= factors
->p
;
613 unsigned long int *e
= factors
->e
;
616 /* Locate position for insert new or increment e. */
617 for (i
= nfactors
- 1; i
>= 0; i
--)
619 if (mpz_cmp (p
[i
], prime
) <= 0)
623 if (i
< 0 || mpz_cmp (p
[i
], prime
) != 0)
625 p
= xrealloc (p
, (nfactors
+ 1) * sizeof p
[0]);
626 e
= xrealloc (e
, (nfactors
+ 1) * sizeof e
[0]);
628 mpz_init (p
[nfactors
]);
629 for (long j
= nfactors
- 1; j
> i
; j
--)
631 mpz_set (p
[j
+ 1], p
[j
]);
634 mpz_set (p
[i
+ 1], prime
);
639 factors
->nfactors
= nfactors
+ 1;
648 mp_factor_insert_ui (struct mp_factors
*factors
, unsigned long int prime
)
652 mpz_init_set_ui (pz
, prime
);
653 mp_factor_insert (factors
, pz
);
658 /* Number of bits in an uintmax_t. */
659 enum { W
= sizeof (uintmax_t) * CHAR_BIT
};
661 /* Verify that uintmax_t does not have holes in its representation. */
662 verify (UINTMAX_MAX
>> (W
- 1) == 1);
664 #define P(a,b,c,d) a,
665 static const unsigned char primes_diff
[] = {
667 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
671 #define PRIMES_PTAB_ENTRIES \
672 (sizeof (primes_diff) / sizeof (primes_diff[0]) - 8 + 1)
674 #define P(a,b,c,d) b,
675 static const unsigned char primes_diff8
[] = {
677 0,0,0,0,0,0,0 /* 7 sentinels for 8-way loop */
686 #define P(a,b,c,d) {c,d},
687 static const struct primes_dtab primes_dtab
[] = {
689 {1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0} /* 7 sentinels for 8-way loop */
693 /* Verify that uintmax_t is not wider than
694 the integers used to generate primes.h. */
695 verify (W
<= WIDE_UINT_BITS
);
697 /* debugging for developers. Enables devmsg().
698 This flag is used only in the GMP code. */
699 static bool dev_debug
= false;
701 /* Prove primality or run probabilistic tests. */
702 static bool flag_prove_primality
= PROVE_PRIMALITY
;
704 /* Number of Miller-Rabin tests to run when not proving primality. */
708 factor_insert_refind (struct factors
*factors
, uintmax_t p
, unsigned int i
,
711 for (unsigned int j
= 0; j
< off
; j
++)
712 p
+= primes_diff
[i
+ j
];
713 factor_insert (factors
, p
);
716 /* Trial division with odd primes uses the following trick.
718 Let p be an odd prime, and B = 2^{W_TYPE_SIZE}. For simplicity,
719 consider the case t < B (this is the second loop below).
721 From our tables we get
723 binv = p^{-1} (mod B)
724 lim = floor ((B-1) / p).
726 First assume that t is a multiple of p, t = q * p. Then 0 <= q <= lim
727 (and all quotients in this range occur for some t).
729 Then t = q * p is true also (mod B), and p is invertible we get
731 q = t * binv (mod B).
733 Next, assume that t is *not* divisible by p. Since multiplication
734 by binv (mod B) is a one-to-one mapping,
736 t * binv (mod B) > lim,
738 because all the smaller values are already taken.
740 This can be summed up by saying that the function
742 q(t) = binv * t (mod B)
744 is a permutation of the range 0 <= t < B, with the curious property
745 that it maps the multiples of p onto the range 0 <= q <= lim, in
746 order, and the non-multiples of p onto the range lim < q < B.
750 factor_using_division (uintmax_t *t1p
, uintmax_t t1
, uintmax_t t0
,
751 struct factors
*factors
)
759 count_trailing_zeros (cnt
, t1
);
766 count_trailing_zeros (cnt
, t0
);
767 rsh2 (t1
, t0
, t1
, t0
, cnt
);
770 factor_insert_multiplicity (factors
, 2, cnt
);
775 for (i
= 0; t1
> 0 && i
< PRIMES_PTAB_ENTRIES
; i
++)
779 uintmax_t q1
, q0
, hi
;
780 MAYBE_UNUSED
uintmax_t lo
;
782 q0
= t0
* primes_dtab
[i
].binv
;
783 umul_ppmm (hi
, lo
, q0
, p
);
787 q1
= hi
* primes_dtab
[i
].binv
;
788 if (LIKELY (q1
> primes_dtab
[i
].lim
))
791 factor_insert (factors
, p
);
793 p
+= primes_diff
[i
+ 1];
798 #define DIVBLOCK(I) \
802 q = t0 * pd[I].binv; \
803 if (LIKELY (q > pd[I].lim)) \
806 factor_insert_refind (factors, p, i + 1, I); \
810 for (; i
< PRIMES_PTAB_ENTRIES
; i
+= 8)
813 const struct primes_dtab
*pd
= &primes_dtab
[i
];
823 p
+= primes_diff8
[i
];
832 mp_factor_using_division (mpz_t t
, struct mp_factors
*factors
)
837 devmsg ("[trial division] ");
841 p
= mpz_scan1 (t
, 0);
842 mpz_fdiv_q_2exp (t
, t
, p
);
845 mp_factor_insert_ui (factors
, 2);
850 for (unsigned int i
= 1; i
<= PRIMES_PTAB_ENTRIES
;)
852 if (! mpz_divisible_ui_p (t
, p
))
854 p
+= primes_diff
[i
++];
855 if (mpz_cmp_ui (t
, p
* p
) < 0)
860 mpz_tdiv_q_ui (t
, t
, p
);
861 mp_factor_insert_ui (factors
, p
);
868 /* Entry i contains (2i+1)^(-1) mod 2^8. */
869 static const unsigned char binvert_table
[128] =
871 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
872 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
873 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
874 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
875 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
876 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
877 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
878 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
879 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
880 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
881 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
882 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
883 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
884 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
885 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
886 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
889 /* Compute n^(-1) mod B, using a Newton iteration. */
890 #define binv(inv,n) \
892 uintmax_t __n = (n); \
895 __inv = binvert_table[(__n / 2) & 0x7F]; /* 8 */ \
896 if (W_TYPE_SIZE > 8) __inv = 2 * __inv - __inv * __inv * __n; \
897 if (W_TYPE_SIZE > 16) __inv = 2 * __inv - __inv * __inv * __n; \
898 if (W_TYPE_SIZE > 32) __inv = 2 * __inv - __inv * __inv * __n; \
900 if (W_TYPE_SIZE > 64) \
902 int __invbits = 64; \
904 __inv = 2 * __inv - __inv * __inv * __n; \
906 } while (__invbits < W_TYPE_SIZE); \
912 /* q = u / d, assuming d|u. */
913 #define divexact_21(q1, q0, u1, u0, d) \
915 uintmax_t _di, _q0; \
921 MAYBE_UNUSED intmax_t _p0; \
922 umul_ppmm (_p1, _p0, _q0, d); \
923 (q1) = ((u1) - _p1) * _di; \
934 #define redcify(r_prim, r, n) \
936 MAYBE_UNUSED uintmax_t _redcify_q; \
937 udiv_qrnnd (_redcify_q, r_prim, r, 0, n); \
940 /* x B^2 (mod n). Requires x > 0, n1 < B/2. */
941 #define redcify2(r1, r0, x, n1, n0) \
943 uintmax_t _r1, _r0, _i; \
946 _r1 = (x); _r0 = 0; \
951 _r1 = 0; _r0 = (x); \
952 _i = 2 * W_TYPE_SIZE; \
956 lsh2 (_r1, _r0, _r1, _r0, 1); \
957 if (ge2 (_r1, _r0, (n1), (n0))) \
958 sub_ddmmss (_r1, _r0, _r1, _r0, (n1), (n0)); \
964 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
965 Both a and b must be in redc form, the result will be in redc form too. */
966 static inline uintmax_t
967 mulredc (uintmax_t a
, uintmax_t b
, uintmax_t m
, uintmax_t mi
)
969 uintmax_t rh
, rl
, q
, th
, xh
;
970 MAYBE_UNUSED
uintmax_t tl
;
972 umul_ppmm (rh
, rl
, a
, b
);
974 umul_ppmm (th
, tl
, q
, m
);
982 /* Modular two-word multiplication, r = a * b mod m, with mi = m^(-1) mod B.
983 Both a and b must be in redc form, the result will be in redc form too.
984 For performance reasons, the most significant bit of m must be clear. */
986 mulredc2 (uintmax_t *r1p
,
987 uintmax_t a1
, uintmax_t a0
, uintmax_t b1
, uintmax_t b0
,
988 uintmax_t m1
, uintmax_t m0
, uintmax_t mi
)
990 uintmax_t r1
, r0
, q
, p1
, t1
, t0
, s1
, s0
;
991 MAYBE_UNUSED
uintmax_t p0
;
993 assert ((a1
>> (W_TYPE_SIZE
- 1)) == 0);
994 assert ((b1
>> (W_TYPE_SIZE
- 1)) == 0);
995 assert ((m1
>> (W_TYPE_SIZE
- 1)) == 0);
997 /* First compute a0 * <b1, b0> B^{-1}
1010 umul_ppmm (t1
, t0
, a0
, b0
);
1011 umul_ppmm (r1
, r0
, a0
, b1
);
1013 umul_ppmm (p1
, p0
, q
, m0
);
1014 umul_ppmm (s1
, s0
, q
, m1
);
1015 r0
+= (t0
!= 0); /* Carry */
1016 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1017 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1018 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1020 /* Next, (a1 * <b1, b0> + <r1, r0> B^{-1}
1035 umul_ppmm (t1
, t0
, a1
, b0
);
1036 umul_ppmm (s1
, s0
, a1
, b1
);
1037 add_ssaaaa (t1
, t0
, t1
, t0
, 0, r0
);
1039 add_ssaaaa (r1
, r0
, s1
, s0
, 0, r1
);
1040 umul_ppmm (p1
, p0
, q
, m0
);
1041 umul_ppmm (s1
, s0
, q
, m1
);
1042 r0
+= (t0
!= 0); /* Carry */
1043 add_ssaaaa (r1
, r0
, r1
, r0
, 0, p1
);
1044 add_ssaaaa (r1
, r0
, r1
, r0
, 0, t1
);
1045 add_ssaaaa (r1
, r0
, r1
, r0
, s1
, s0
);
1047 if (ge2 (r1
, r0
, m1
, m0
))
1048 sub_ddmmss (r1
, r0
, r1
, r0
, m1
, m0
);
1056 powm (uintmax_t b
, uintmax_t e
, uintmax_t n
, uintmax_t ni
, uintmax_t one
)
1065 b
= mulredc (b
, b
, n
, ni
);
1069 y
= mulredc (y
, b
, n
, ni
);
1076 powm2 (uintmax_t *r1m
,
1077 const uintmax_t *bp
, const uintmax_t *ep
, const uintmax_t *np
,
1078 uintmax_t ni
, const uintmax_t *one
)
1080 uintmax_t r1
, r0
, b1
, b0
, n1
, n0
;
1092 for (e
= ep
[0], i
= W_TYPE_SIZE
; i
> 0; i
--, e
>>= 1)
1096 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1099 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1102 for (e
= ep
[1]; e
> 0; e
>>= 1)
1106 r0
= mulredc2 (r1m
, r1
, r0
, b1
, b0
, n1
, n0
, ni
);
1109 b0
= mulredc2 (r1m
, b1
, b0
, b1
, b0
, n1
, n0
, ni
);
1118 millerrabin (uintmax_t n
, uintmax_t ni
, uintmax_t b
, uintmax_t q
,
1119 unsigned int k
, uintmax_t one
)
1121 uintmax_t y
= powm (b
, q
, n
, ni
, one
);
1123 uintmax_t nm1
= n
- one
; /* -1, but in redc representation. */
1125 if (y
== one
|| y
== nm1
)
1128 for (unsigned int i
= 1; i
< k
; i
++)
1130 y
= mulredc (y
, y
, n
, ni
);
1140 ATTRIBUTE_PURE
static bool
1141 millerrabin2 (const uintmax_t *np
, uintmax_t ni
, const uintmax_t *bp
,
1142 const uintmax_t *qp
, unsigned int k
, const uintmax_t *one
)
1144 uintmax_t y1
, y0
, nm1_1
, nm1_0
, r1m
;
1146 y0
= powm2 (&r1m
, bp
, qp
, np
, ni
, one
);
1149 if (y0
== one
[0] && y1
== one
[1])
1152 sub_ddmmss (nm1_1
, nm1_0
, np
[1], np
[0], one
[1], one
[0]);
1154 if (y0
== nm1_0
&& y1
== nm1_1
)
1157 for (unsigned int i
= 1; i
< k
; i
++)
1159 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, np
[1], np
[0], ni
);
1162 if (y0
== nm1_0
&& y1
== nm1_1
)
1164 if (y0
== one
[0] && y1
== one
[1])
1171 mp_millerrabin (mpz_srcptr n
, mpz_srcptr nm1
, mpz_ptr x
, mpz_ptr y
,
1172 mpz_srcptr q
, unsigned long int k
)
1174 mpz_powm (y
, x
, q
, n
);
1176 if (mpz_cmp_ui (y
, 1) == 0 || mpz_cmp (y
, nm1
) == 0)
1179 for (unsigned long int i
= 1; i
< k
; i
++)
1181 mpz_powm_ui (y
, y
, 2, n
);
1182 if (mpz_cmp (y
, nm1
) == 0)
1184 if (mpz_cmp_ui (y
, 1) == 0)
1190 /* Lucas' prime test. The number of iterations vary greatly, up to a few dozen
1191 have been observed. The average seem to be about 2. */
1193 prime_p (uintmax_t n
)
1197 uintmax_t a_prim
, one
, ni
;
1198 struct factors factors
;
1203 /* We have already casted out small primes. */
1204 if (n
< (uintmax_t) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
)
1207 /* Precomputation for Miller-Rabin. */
1208 uintmax_t q
= n
- 1;
1209 for (k
= 0; (q
& 1) == 0; k
++)
1213 binv (ni
, n
); /* ni <- 1/n mod B */
1214 redcify (one
, 1, n
);
1215 addmod (a_prim
, one
, one
, n
); /* i.e., redcify a = 2 */
1217 /* Perform a Miller-Rabin test, finds most composites quickly. */
1218 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1221 if (flag_prove_primality
)
1223 /* Factor n-1 for Lucas. */
1224 factor (0, n
- 1, &factors
);
1227 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1228 number composite. */
1229 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1231 if (flag_prove_primality
)
1234 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1237 = powm (a_prim
, (n
- 1) / factors
.p
[i
], n
, ni
, one
) != one
;
1242 /* After enough Miller-Rabin runs, be content. */
1243 is_prime
= (r
== MR_REPS
- 1);
1249 a
+= primes_diff
[r
]; /* Establish new base. */
1251 /* The following is equivalent to redcify (a_prim, a, n). It runs faster
1252 on most processors, since it avoids udiv_qrnnd. If we go down the
1253 udiv_qrnnd_preinv path, this code should be replaced. */
1256 umul_ppmm (s1
, s0
, one
, a
);
1257 if (LIKELY (s1
== 0))
1261 MAYBE_UNUSED
uintmax_t dummy
;
1262 udiv_qrnnd (dummy
, a_prim
, s1
, s0
, n
);
1266 if (!millerrabin (n
, ni
, a_prim
, q
, k
, one
))
1270 error (0, 0, _("Lucas prime test failure. This should not happen"));
1275 prime2_p (uintmax_t n1
, uintmax_t n0
)
1277 uintmax_t q
[2], nm1
[2];
1278 uintmax_t a_prim
[2];
1283 struct factors factors
;
1286 return prime_p (n0
);
1288 nm1
[1] = n1
- (n0
== 0);
1292 count_trailing_zeros (k
, nm1
[1]);
1300 count_trailing_zeros (k
, nm1
[0]);
1301 rsh2 (q
[1], q
[0], nm1
[1], nm1
[0], k
);
1306 redcify2 (one
[1], one
[0], 1, n1
, n0
);
1307 addmod2 (a_prim
[1], a_prim
[0], one
[1], one
[0], one
[1], one
[0], n1
, n0
);
1309 /* FIXME: Use scalars or pointers in arguments? Some consistency needed. */
1313 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1316 if (flag_prove_primality
)
1318 /* Factor n-1 for Lucas. */
1319 factor (nm1
[1], nm1
[0], &factors
);
1322 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1323 number composite. */
1324 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1327 uintmax_t e
[2], y
[2];
1329 if (flag_prove_primality
)
1332 if (factors
.plarge
[1])
1335 binv (pi
, factors
.plarge
[0]);
1338 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1339 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1341 for (unsigned int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1343 /* FIXME: We always have the factor 2. Do we really need to
1344 handle it here? We have done the same powering as part
1346 if (factors
.p
[i
] == 2)
1347 rsh2 (e
[1], e
[0], nm1
[1], nm1
[0], 1);
1349 divexact_21 (e
[1], e
[0], nm1
[1], nm1
[0], factors
.p
[i
]);
1350 y
[0] = powm2 (&y
[1], a_prim
, e
, na
, ni
, one
);
1351 is_prime
= (y
[0] != one
[0] || y
[1] != one
[1]);
1356 /* After enough Miller-Rabin runs, be content. */
1357 is_prime
= (r
== MR_REPS
- 1);
1363 a
+= primes_diff
[r
]; /* Establish new base. */
1364 redcify2 (a_prim
[1], a_prim
[0], a
, n1
, n0
);
1366 if (!millerrabin2 (na
, ni
, a_prim
, q
, k
, one
))
1370 error (0, 0, _("Lucas prime test failure. This should not happen"));
1375 mp_prime_p (mpz_t n
)
1378 mpz_t q
, a
, nm1
, tmp
;
1379 struct mp_factors factors
;
1381 if (mpz_cmp_ui (n
, 1) <= 0)
1384 /* We have already casted out small primes. */
1385 if (mpz_cmp_ui (n
, (long) FIRST_OMITTED_PRIME
* FIRST_OMITTED_PRIME
) < 0)
1388 mpz_inits (q
, a
, nm1
, tmp
, NULL
);
1390 /* Precomputation for Miller-Rabin. */
1391 mpz_sub_ui (nm1
, n
, 1);
1393 /* Find q and k, where q is odd and n = 1 + 2**k * q. */
1394 unsigned long int k
= mpz_scan1 (nm1
, 0);
1395 mpz_tdiv_q_2exp (q
, nm1
, k
);
1399 /* Perform a Miller-Rabin test, finds most composites quickly. */
1400 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1406 if (flag_prove_primality
)
1408 /* Factor n-1 for Lucas. */
1410 mp_factor (tmp
, &factors
);
1413 /* Loop until Lucas proves our number prime, or Miller-Rabin proves our
1414 number composite. */
1415 for (unsigned int r
= 0; r
< PRIMES_PTAB_ENTRIES
; r
++)
1417 if (flag_prove_primality
)
1420 for (unsigned long int i
= 0; i
< factors
.nfactors
&& is_prime
; i
++)
1422 mpz_divexact (tmp
, nm1
, factors
.p
[i
]);
1423 mpz_powm (tmp
, a
, tmp
, n
);
1424 is_prime
= mpz_cmp_ui (tmp
, 1) != 0;
1429 /* After enough Miller-Rabin runs, be content. */
1430 is_prime
= (r
== MR_REPS
- 1);
1436 mpz_add_ui (a
, a
, primes_diff
[r
]); /* Establish new base. */
1438 if (!mp_millerrabin (n
, nm1
, a
, tmp
, q
, k
))
1445 error (0, 0, _("Lucas prime test failure. This should not happen"));
1449 if (flag_prove_primality
)
1450 mp_factor_clear (&factors
);
1452 mpz_clears (q
, a
, nm1
, tmp
, NULL
);
1458 factor_using_pollard_rho (uintmax_t n
, unsigned long int a
,
1459 struct factors
*factors
)
1461 uintmax_t x
, z
, y
, P
, t
, ni
, g
;
1463 unsigned long int k
= 1;
1464 unsigned long int l
= 1;
1467 addmod (x
, P
, P
, n
); /* i.e., redcify(2) */
1474 binv (ni
, n
); /* FIXME: when could we use old 'ni' value? */
1480 x
= mulredc (x
, x
, n
, ni
);
1481 addmod (x
, x
, a
, n
);
1483 submod (t
, z
, x
, n
);
1484 P
= mulredc (P
, t
, n
, ni
);
1488 if (gcd_odd (P
, n
) != 1)
1498 for (unsigned long int i
= 0; i
< k
; i
++)
1500 x
= mulredc (x
, x
, n
, ni
);
1501 addmod (x
, x
, a
, n
);
1509 y
= mulredc (y
, y
, n
, ni
);
1510 addmod (y
, y
, a
, n
);
1512 submod (t
, z
, y
, n
);
1519 /* Found n itself as factor. Restart with different params. */
1520 factor_using_pollard_rho (n
, a
+ 1, factors
);
1527 factor_using_pollard_rho (g
, a
+ 1, factors
);
1529 factor_insert (factors
, g
);
1533 factor_insert (factors
, n
);
1544 factor_using_pollard_rho2 (uintmax_t n1
, uintmax_t n0
, unsigned long int a
,
1545 struct factors
*factors
)
1547 uintmax_t x1
, x0
, z1
, z0
, y1
, y0
, P1
, P0
, t1
, t0
, ni
, g1
, g0
, r1m
;
1549 unsigned long int k
= 1;
1550 unsigned long int l
= 1;
1552 redcify2 (P1
, P0
, 1, n1
, n0
);
1553 addmod2 (x1
, x0
, P1
, P0
, P1
, P0
, n1
, n0
); /* i.e., redcify(2) */
1557 while (n1
!= 0 || n0
!= 1)
1565 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1567 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1569 submod2 (t1
, t0
, z1
, z0
, x1
, x0
, n1
, n0
);
1570 P0
= mulredc2 (&r1m
, P1
, P0
, t1
, t0
, n1
, n0
, ni
);
1575 g0
= gcd2_odd (&g1
, P1
, P0
, n1
, n0
);
1576 if (g1
!= 0 || g0
!= 1)
1586 for (unsigned long int i
= 0; i
< k
; i
++)
1588 x0
= mulredc2 (&r1m
, x1
, x0
, x1
, x0
, n1
, n0
, ni
);
1590 addmod2 (x1
, x0
, x1
, x0
, 0, (uintmax_t) a
, n1
, n0
);
1598 y0
= mulredc2 (&r1m
, y1
, y0
, y1
, y0
, n1
, n0
, ni
);
1600 addmod2 (y1
, y0
, y1
, y0
, 0, (uintmax_t) a
, n1
, n0
);
1602 submod2 (t1
, t0
, z1
, z0
, y1
, y0
, n1
, n0
);
1603 g0
= gcd2_odd (&g1
, t1
, t0
, n1
, n0
);
1605 while (g1
== 0 && g0
== 1);
1609 /* The found factor is one word, and > 1. */
1610 divexact_21 (n1
, n0
, n1
, n0
, g0
); /* n = n / g */
1613 factor_using_pollard_rho (g0
, a
+ 1, factors
);
1615 factor_insert (factors
, g0
);
1619 /* The found factor is two words. This is highly unlikely, thus hard
1620 to trigger. Please be careful before you change this code! */
1623 if (n1
== g1
&& n0
== g0
)
1625 /* Found n itself as factor. Restart with different params. */
1626 factor_using_pollard_rho2 (n1
, n0
, a
+ 1, factors
);
1630 /* Compute n = n / g. Since the result will fit one word,
1631 we can compute the quotient modulo B, ignoring the high
1637 if (!prime2_p (g1
, g0
))
1638 factor_using_pollard_rho2 (g1
, g0
, a
+ 1, factors
);
1640 factor_insert_large (factors
, g1
, g0
);
1647 factor_insert (factors
, n0
);
1651 factor_using_pollard_rho (n0
, a
, factors
);
1655 if (prime2_p (n1
, n0
))
1657 factor_insert_large (factors
, n1
, n0
);
1661 x0
= mod2 (&x1
, x1
, x0
, n1
, n0
);
1662 z0
= mod2 (&z1
, z1
, z0
, n1
, n0
);
1663 y0
= mod2 (&y1
, y1
, y0
, n1
, n0
);
1668 mp_factor_using_pollard_rho (mpz_t n
, unsigned long int a
,
1669 struct mp_factors
*factors
)
1674 devmsg ("[pollard-rho (%lu)] ", a
);
1676 mpz_inits (t
, t2
, NULL
);
1677 mpz_init_set_si (y
, 2);
1678 mpz_init_set_si (x
, 2);
1679 mpz_init_set_si (z
, 2);
1680 mpz_init_set_ui (P
, 1);
1682 unsigned long long int k
= 1;
1683 unsigned long long int l
= 1;
1685 while (mpz_cmp_ui (n
, 1) != 0)
1693 mpz_add_ui (x
, x
, a
);
1702 if (mpz_cmp_ui (t
, 1) != 0)
1712 for (unsigned long long int i
= 0; i
< k
; i
++)
1716 mpz_add_ui (x
, x
, a
);
1726 mpz_add_ui (y
, y
, a
);
1731 while (mpz_cmp_ui (t
, 1) == 0);
1733 mpz_divexact (n
, n
, t
); /* divide by t, before t is overwritten */
1735 if (!mp_prime_p (t
))
1737 devmsg ("[composite factor--restarting pollard-rho] ");
1738 mp_factor_using_pollard_rho (t
, a
+ 1, factors
);
1742 mp_factor_insert (factors
, t
);
1747 mp_factor_insert (factors
, n
);
1756 mpz_clears (P
, t2
, t
, z
, x
, y
, NULL
);
1760 /* FIXME: Maybe better to use an iteration converging to 1/sqrt(n)? If
1761 algorithm is replaced, consider also returning the remainder. */
1771 count_leading_zeros (c
, n
);
1773 /* Make x > sqrt(n). This will be invariant through the loop. */
1774 x
= (uintmax_t) 1 << ((W_TYPE_SIZE
+ 1 - c
) / 2);
1778 uintmax_t y
= (x
+ n
/ x
) / 2;
1788 isqrt2 (uintmax_t nh
, uintmax_t nl
)
1793 /* Ensures the remainder fits in an uintmax_t. */
1794 assert (nh
< ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)));
1799 count_leading_zeros (shift
, nh
);
1802 /* Make x > sqrt (n). */
1803 x
= isqrt ((nh
<< shift
) + (nl
>> (W_TYPE_SIZE
- shift
))) + 1;
1804 x
<<= (W_TYPE_SIZE
- shift
) / 2;
1806 /* Do we need more than one iteration? */
1809 MAYBE_UNUSED
uintmax_t r
;
1811 udiv_qrnnd (q
, r
, nh
, nl
, x
);
1817 umul_ppmm (hi
, lo
, x
+ 1, x
+ 1);
1818 assert (gt2 (hi
, lo
, nh
, nl
));
1820 umul_ppmm (hi
, lo
, x
, x
);
1821 assert (ge2 (nh
, nl
, hi
, lo
));
1822 sub_ddmmss (hi
, lo
, nh
, nl
, hi
, lo
);
1832 /* MAGIC[N] has a bit i set iff i is a quadratic residue mod N. */
1833 # define MAGIC64 0x0202021202030213ULL
1834 # define MAGIC63 0x0402483012450293ULL
1835 # define MAGIC65 0x218a019866014613ULL
1836 # define MAGIC11 0x23b
1838 /* Return the square root if the input is a square, otherwise 0. */
1841 is_square (uintmax_t x
)
1843 /* Uses the tests suggested by Cohen. Excludes 99% of the non-squares before
1844 computing the square root. */
1845 if (((MAGIC64
>> (x
& 63)) & 1)
1846 && ((MAGIC63
>> (x
% 63)) & 1)
1847 /* Both 0 and 64 are squares mod (65). */
1848 && ((MAGIC65
>> ((x
% 65) & 63)) & 1)
1849 && ((MAGIC11
>> (x
% 11) & 1)))
1851 uintmax_t r
= isqrt (x
);
1858 /* invtab[i] = floor (0x10000 / (0x100 + i) */
1859 static const unsigned short invtab
[0x81] =
1862 0x1fc, 0x1f8, 0x1f4, 0x1f0, 0x1ec, 0x1e9, 0x1e5, 0x1e1,
1863 0x1de, 0x1da, 0x1d7, 0x1d4, 0x1d0, 0x1cd, 0x1ca, 0x1c7,
1864 0x1c3, 0x1c0, 0x1bd, 0x1ba, 0x1b7, 0x1b4, 0x1b2, 0x1af,
1865 0x1ac, 0x1a9, 0x1a6, 0x1a4, 0x1a1, 0x19e, 0x19c, 0x199,
1866 0x197, 0x194, 0x192, 0x18f, 0x18d, 0x18a, 0x188, 0x186,
1867 0x183, 0x181, 0x17f, 0x17d, 0x17a, 0x178, 0x176, 0x174,
1868 0x172, 0x170, 0x16e, 0x16c, 0x16a, 0x168, 0x166, 0x164,
1869 0x162, 0x160, 0x15e, 0x15c, 0x15a, 0x158, 0x157, 0x155,
1870 0x153, 0x151, 0x150, 0x14e, 0x14c, 0x14a, 0x149, 0x147,
1871 0x146, 0x144, 0x142, 0x141, 0x13f, 0x13e, 0x13c, 0x13b,
1872 0x139, 0x138, 0x136, 0x135, 0x133, 0x132, 0x130, 0x12f,
1873 0x12e, 0x12c, 0x12b, 0x129, 0x128, 0x127, 0x125, 0x124,
1874 0x123, 0x121, 0x120, 0x11f, 0x11e, 0x11c, 0x11b, 0x11a,
1875 0x119, 0x118, 0x116, 0x115, 0x114, 0x113, 0x112, 0x111,
1876 0x10f, 0x10e, 0x10d, 0x10c, 0x10b, 0x10a, 0x109, 0x108,
1877 0x107, 0x106, 0x105, 0x104, 0x103, 0x102, 0x101, 0x100,
1880 /* Compute q = [u/d], r = u mod d. Avoids slow hardware division for the case
1881 that q < 0x40; here it instead uses a table of (Euclidian) inverses. */
1882 # define div_smallq(q, r, u, d) \
1884 if ((u) / 0x40 < (d)) \
1887 uintmax_t _dinv, _mask, _q, _r; \
1888 count_leading_zeros (_cnt, (d)); \
1890 if (UNLIKELY (_cnt > (W_TYPE_SIZE - 8))) \
1892 _dinv = invtab[((d) << (_cnt + 8 - W_TYPE_SIZE)) - 0x80]; \
1893 _q = _dinv * _r >> (8 + W_TYPE_SIZE - _cnt); \
1897 _dinv = invtab[((d) >> (W_TYPE_SIZE - 8 - _cnt)) - 0x7f]; \
1898 _q = _dinv * (_r >> (W_TYPE_SIZE - 3 - _cnt)) >> 11; \
1902 _mask = -(uintmax_t) (_r >= (d)); \
1903 (r) = _r - (_mask & (d)); \
1905 assert ((q) * (d) + (r) == u); \
1909 uintmax_t _q = (u) / (d); \
1910 (r) = (u) - _q * (d); \
1915 /* Notes: Example N = 22117019. After first phase we find Q1 = 6314, Q
1916 = 3025, P = 1737, representing F_{18} = (-6314, 2 * 1737, 3025),
1919 Constructing the square root, we get Q1 = 55, Q = 8653, P = 4652,
1920 representing G_0 = (-55, 2 * 4652, 8653).
1922 In the notation of the paper:
1924 S_{-1} = 55, S_0 = 8653, R_0 = 4652
1928 t_0 = floor([q_0 + R_0] / S0) = 1
1929 R_1 = t_0 * S_0 - R_0 = 4001
1930 S_1 = S_{-1} +t_0 (R_0 - R_1) = 706
1933 /* Multipliers, in order of efficiency:
1934 0.7268 3*5*7*11 = 1155 = 3 (mod 4)
1935 0.7317 3*5*7 = 105 = 1
1936 0.7820 3*5*11 = 165 = 1
1938 0.8101 3*7*11 = 231 = 3
1940 0.8284 5*7*11 = 385 = 1
1942 0.8716 3*11 = 33 = 1
1944 0.8913 5*11 = 55 = 3
1946 0.9233 7*11 = 77 = 1
1950 # define QUEUE_SIZE 50
1954 # define Q_FREQ_SIZE 50
1955 /* Element 0 keeps the total */
1956 static unsigned int q_freq
[Q_FREQ_SIZE
+ 1];
1960 /* Return true on success. Expected to fail only for numbers
1961 >= 2^{2*W_TYPE_SIZE - 2}, or close to that limit. */
1963 factor_using_squfof (uintmax_t n1
, uintmax_t n0
, struct factors
*factors
)
1965 /* Uses algorithm and notation from
1967 SQUARE FORM FACTORIZATION
1968 JASON E. GOWER AND SAMUEL S. WAGSTAFF, JR.
1970 https://homes.cerias.purdue.edu/~ssw/squfof.pdf
1973 static const unsigned int multipliers_1
[] =
1975 105, 165, 21, 385, 33, 5, 77, 1, 0
1977 static const unsigned int multipliers_3
[] =
1979 1155, 15, 231, 35, 3, 55, 7, 11, 0
1982 const unsigned int *m
;
1984 struct { uintmax_t Q
; uintmax_t P
; } queue
[QUEUE_SIZE
];
1986 if (n1
>= ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)))
1989 uintmax_t sqrt_n
= isqrt2 (n1
, n0
);
1991 if (n0
== sqrt_n
* sqrt_n
)
1995 umul_ppmm (p1
, p0
, sqrt_n
, sqrt_n
);
2000 if (prime_p (sqrt_n
))
2001 factor_insert_multiplicity (factors
, sqrt_n
, 2);
2007 if (!factor_using_squfof (0, sqrt_n
, &f
))
2009 /* Try pollard rho instead */
2010 factor_using_pollard_rho (sqrt_n
, 1, &f
);
2012 /* Duplicate the new factors */
2013 for (unsigned int i
= 0; i
< f
.nfactors
; i
++)
2014 factor_insert_multiplicity (factors
, f
.p
[i
], 2 * f
.e
[i
]);
2020 /* Select multipliers so we always get n * mu = 3 (mod 4) */
2021 for (m
= (n0
% 4 == 1) ? multipliers_3
: multipliers_1
;
2024 uintmax_t S
, Dh
, Dl
, Q1
, Q
, P
, L
, L1
, B
;
2026 unsigned int mu
= *m
;
2027 unsigned int qpos
= 0;
2029 assert (mu
* n0
% 4 == 3);
2031 /* In the notation of the paper, with mu * n == 3 (mod 4), we
2032 get \Delta = 4 mu * n, and the paper's \mu is 2 mu. As far as
2033 I understand it, the necessary bound is 4 \mu^3 < n, or 32
2036 However, this seems insufficient: With n = 37243139 and mu =
2037 105, we get a trivial factor, from the square 38809 = 197^2,
2038 without any corresponding Q earlier in the iteration.
2040 Requiring 64 mu^3 < n seems sufficient. */
2043 if ((uintmax_t) mu
* mu
* mu
>= n0
/ 64)
2048 if (n1
> ((uintmax_t) 1 << (W_TYPE_SIZE
- 2)) / mu
)
2051 umul_ppmm (Dh
, Dl
, n0
, mu
);
2054 assert (Dl
% 4 != 1);
2055 assert (Dh
< (uintmax_t) 1 << (W_TYPE_SIZE
- 2));
2057 S
= isqrt2 (Dh
, Dl
);
2062 /* Square root remainder fits in one word, so ignore high part. */
2064 /* FIXME: When can this differ from floor (sqrt (2 * sqrt (D)))? */
2069 /* The form is (+/- Q1, 2P, -/+ Q), of discriminant 4 (P^2 + Q Q1) =
2072 for (i
= 0; i
<= B
; i
++)
2074 uintmax_t q
, P1
, t
, rem
;
2076 div_smallq (q
, rem
, S
+ P
, Q
);
2077 P1
= S
- rem
; /* P1 = q*Q - P */
2079 IF_LINT (assert (q
> 0 && Q
> 0));
2083 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2093 g
/= gcd_odd (g
, mu
);
2097 if (qpos
>= QUEUE_SIZE
)
2098 die (EXIT_FAILURE
, 0, _("squfof queue overflow"));
2100 queue
[qpos
].P
= P
% g
;
2105 /* I think the difference can be either sign, but mod
2106 2^W_TYPE_SIZE arithmetic should be fine. */
2107 t
= Q1
+ q
* (P
- P1
);
2114 uintmax_t r
= is_square (Q
);
2117 for (unsigned int j
= 0; j
< qpos
; j
++)
2119 if (queue
[j
].Q
== r
)
2122 /* Traversed entire cycle. */
2123 goto next_multiplier
;
2125 /* Need the absolute value for divisibility test. */
2126 if (P
>= queue
[j
].P
)
2132 /* Delete entries up to and including entry
2133 j, which matched. */
2134 memmove (queue
, queue
+ j
+ 1,
2135 (qpos
- j
- 1) * sizeof (queue
[0]));
2142 /* We have found a square form, which should give a
2145 assert (S
>= P
); /* What signs are possible? */
2146 P
+= r
* ((S
- P
) / r
);
2148 /* Note: Paper says (N - P*P) / Q1, that seems incorrect
2149 for the case D = 2N. */
2150 /* Compute Q = (D - P*P) / Q1, but we need double
2153 umul_ppmm (hi
, lo
, P
, P
);
2154 sub_ddmmss (hi
, lo
, Dh
, Dl
, hi
, lo
);
2155 udiv_qrnnd (Q
, rem
, hi
, lo
, Q1
);
2160 /* Note: There appears to by a typo in the paper,
2161 Step 4a in the algorithm description says q <--
2162 floor([S+P]/\hat Q), but looking at the equations
2163 in Sec. 3.1, it should be q <-- floor([S+P] / Q).
2164 (In this code, \hat Q is Q1). */
2165 div_smallq (q
, rem
, S
+ P
, Q
);
2166 P1
= S
- rem
; /* P1 = q*Q - P */
2170 q_freq
[MIN (q
, Q_FREQ_SIZE
)]++;
2174 t
= Q1
+ q
* (P
- P1
);
2182 Q
/= gcd_odd (Q
, mu
);
2184 assert (Q
> 1 && (n1
|| Q
< n0
));
2187 factor_insert (factors
, Q
);
2188 else if (!factor_using_squfof (0, Q
, factors
))
2189 factor_using_pollard_rho (Q
, 2, factors
);
2191 divexact_21 (n1
, n0
, n1
, n0
, Q
);
2193 if (prime2_p (n1
, n0
))
2194 factor_insert_large (factors
, n1
, n0
);
2197 if (!factor_using_squfof (n1
, n0
, factors
))
2200 factor_using_pollard_rho (n0
, 1, factors
);
2202 factor_using_pollard_rho2 (n1
, n0
, 1, factors
);
2217 /* Compute the prime factors of the 128-bit number (T1,T0), and put the
2218 results in FACTORS. */
2220 factor (uintmax_t t1
, uintmax_t t0
, struct factors
*factors
)
2222 factors
->nfactors
= 0;
2223 factors
->plarge
[1] = 0;
2225 if (t1
== 0 && t0
< 2)
2228 t0
= factor_using_division (&t1
, t1
, t0
, factors
);
2230 if (t1
== 0 && t0
< 2)
2233 if (prime2_p (t1
, t0
))
2234 factor_insert_large (factors
, t1
, t0
);
2238 if (factor_using_squfof (t1
, t0
, factors
))
2243 factor_using_pollard_rho (t0
, 1, factors
);
2245 factor_using_pollard_rho2 (t1
, t0
, 1, factors
);
2249 /* Use Pollard-rho to compute the prime factors of
2250 arbitrary-precision T, and put the results in FACTORS. */
2252 mp_factor (mpz_t t
, struct mp_factors
*factors
)
2254 mp_factor_init (factors
);
2256 if (mpz_sgn (t
) != 0)
2258 mp_factor_using_division (t
, factors
);
2260 if (mpz_cmp_ui (t
, 1) != 0)
2262 devmsg ("[is number prime?] ");
2264 mp_factor_insert (factors
, t
);
2266 mp_factor_using_pollard_rho (t
, 1, factors
);
2272 strto2uintmax (uintmax_t *hip
, uintmax_t *lop
, char const *s
)
2274 unsigned int lo_carry
;
2275 uintmax_t hi
= 0, lo
= 0;
2277 strtol_error err
= LONGINT_INVALID
;
2279 /* Initial scan for invalid digits. */
2283 unsigned int c
= *p
++;
2287 if (UNLIKELY (!ISDIGIT (c
)))
2289 err
= LONGINT_INVALID
;
2293 err
= LONGINT_OK
; /* we've seen at least one valid digit */
2296 while (err
== LONGINT_OK
)
2298 unsigned int c
= *s
++;
2304 if (UNLIKELY (hi
> ~(uintmax_t)0 / 10))
2306 err
= LONGINT_OVERFLOW
;
2311 lo_carry
= (lo
>> (W_TYPE_SIZE
- 3)) + (lo
>> (W_TYPE_SIZE
- 1));
2312 lo_carry
+= 10 * lo
< 2 * lo
;
2319 if (UNLIKELY (hi
< lo_carry
))
2321 err
= LONGINT_OVERFLOW
;
2332 /* Structure and routines for buffering and outputting full lines,
2333 to support parallel operation efficiently. */
2340 /* 512 is chosen to give good performance,
2341 and also is the max guaranteed size that
2342 consumers can read atomically through pipes.
2343 Also it's big enough to cater for max line length
2344 even with 128 bit uintmax_t. */
2345 #define FACTOR_PIPE_BUF 512
2353 /* Double to ensure enough space for
2354 previous numbers + next number. */
2355 lbuf
.buf
= xmalloc (FACTOR_PIPE_BUF
* 2);
2356 lbuf
.end
= lbuf
.buf
;
2359 /* Write complete LBUF to standard output. */
2363 size_t size
= lbuf
.end
- lbuf
.buf
;
2364 if (full_write (STDOUT_FILENO
, lbuf
.buf
, size
) != size
)
2365 die (EXIT_FAILURE
, errno
, "%s", _("write error"));
2366 lbuf
.end
= lbuf
.buf
;
2369 /* Add a character C to LBUF and if it's a newline
2370 and enough bytes are already buffered,
2371 then write atomically to standard output. */
2379 size_t buffered
= lbuf
.end
- lbuf
.buf
;
2381 /* Provide immediate output for interactive use. */
2382 static int line_buffered
= -1;
2383 if (line_buffered
== -1)
2384 line_buffered
= isatty (STDIN_FILENO
) || isatty (STDOUT_FILENO
);
2387 else if (buffered
>= FACTOR_PIPE_BUF
)
2389 /* Write output in <= PIPE_BUF chunks
2390 so consumers can read atomically. */
2391 char const *tend
= lbuf
.end
;
2393 /* Since a umaxint_t's factors must fit in 512
2394 we're guaranteed to find a newline here. */
2395 char *tlend
= lbuf
.buf
+ FACTOR_PIPE_BUF
;
2396 while (*--tlend
!= '\n');
2402 /* Buffer the remainder. */
2403 memcpy (lbuf
.buf
, tlend
, tend
- tlend
);
2404 lbuf
.end
= lbuf
.buf
+ (tend
- tlend
);
2409 /* Buffer an int to the internal LBUF. */
2411 lbuf_putint (uintmax_t i
, size_t min_width
)
2413 char buf
[INT_BUFSIZE_BOUND (uintmax_t)];
2414 char const *umaxstr
= umaxtostr (i
, buf
);
2415 size_t width
= sizeof (buf
) - (umaxstr
- buf
) - 1;
2418 for (; z
< min_width
; z
++)
2421 memcpy (lbuf
.end
, umaxstr
, width
);
2426 print_uintmaxes (uintmax_t t1
, uintmax_t t0
)
2431 lbuf_putint (t0
, 0);
2434 /* Use very plain code here since it seems hard to write fast code
2435 without assuming a specific word size. */
2436 q
= t1
/ 1000000000;
2437 r
= t1
% 1000000000;
2438 udiv_qrnnd (t0
, r
, r
, t0
, 1000000000);
2439 print_uintmaxes (q
, t0
);
2444 /* Single-precision factoring */
2446 print_factors_single (uintmax_t t1
, uintmax_t t0
)
2448 struct factors factors
;
2450 print_uintmaxes (t1
, t0
);
2453 factor (t1
, t0
, &factors
);
2455 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2456 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2459 print_uintmaxes (0, factors
.p
[j
]);
2462 if (factors
.plarge
[1])
2465 print_uintmaxes (factors
.plarge
[1], factors
.plarge
[0]);
2471 /* Emit the factors of the indicated number. If we have the option of using
2472 either algorithm, we select on the basis of the length of the number.
2473 For longer numbers, we prefer the MP algorithm even if the native algorithm
2474 has enough digits, because the algorithm is better. The turnover point
2475 depends on the value. */
2477 print_factors (char const *input
)
2479 /* Skip initial spaces and '+'. */
2480 char const *str
= input
;
2487 /* Try converting the number to one or two words. If it fails, use GMP or
2488 print an error message. The 2nd condition checks that the most
2489 significant bit of the two-word number is clear, in a typesize neutral
2491 strtol_error err
= strto2uintmax (&t1
, &t0
, str
);
2496 if (((t1
<< 1) >> 1) == t1
)
2498 devmsg ("[using single-precision arithmetic] ");
2499 print_factors_single (t1
, t0
);
2504 case LONGINT_OVERFLOW
:
2509 error (0, 0, _("%s is not a valid positive integer"), quote (input
));
2513 devmsg ("[using arbitrary-precision arithmetic] ");
2515 struct mp_factors factors
;
2517 mpz_init_set_str (t
, str
, 10);
2519 mpz_out_str (stdout
, 10, t
);
2521 mp_factor (t
, &factors
);
2523 for (unsigned int j
= 0; j
< factors
.nfactors
; j
++)
2524 for (unsigned int k
= 0; k
< factors
.e
[j
]; k
++)
2527 mpz_out_str (stdout
, 10, factors
.p
[j
]);
2530 mp_factor_clear (&factors
);
2540 if (status
!= EXIT_SUCCESS
)
2545 Usage: %s [NUMBER]...\n\
2548 program_name
, program_name
);
2550 Print the prime factors of each specified integer NUMBER. If none\n\
2551 are specified on the command line, read them from standard input.\n\
2554 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
2555 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
2556 emit_ancillary_info (PROGRAM_NAME
);
2565 token_buffer tokenbuffer
;
2567 init_tokenbuffer (&tokenbuffer
);
2571 size_t token_length
= readtoken (stdin
, DELIM
, sizeof (DELIM
) - 1,
2573 if (token_length
== (size_t) -1)
2575 ok
&= print_factors (tokenbuffer
.buffer
);
2577 free (tokenbuffer
.buffer
);
2583 main (int argc
, char **argv
)
2585 initialize_main (&argc
, &argv
);
2586 set_program_name (argv
[0]);
2587 setlocale (LC_ALL
, "");
2588 bindtextdomain (PACKAGE
, LOCALEDIR
);
2589 textdomain (PACKAGE
);
2592 atexit (close_stdout
);
2593 atexit (lbuf_flush
);
2596 while ((c
= getopt_long (argc
, argv
, "", long_options
, NULL
)) != -1)
2600 case DEV_DEBUG_OPTION
:
2604 case_GETOPT_HELP_CHAR
;
2606 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
2609 usage (EXIT_FAILURE
);
2614 memset (q_freq
, 0, sizeof (q_freq
));
2623 for (int i
= optind
; i
< argc
; i
++)
2624 if (! print_factors (argv
[i
]))
2632 printf ("q freq. cum. freq.(total: %d)\n", q_freq
[0]);
2633 for (unsigned int i
= 1, acc_f
= 0.0; i
<= Q_FREQ_SIZE
; i
++)
2635 double f
= (double) q_freq
[i
] / q_freq
[0];
2637 printf ("%s%d %.2f%% %.2f%%\n", i
== Q_FREQ_SIZE
? ">=" : "", i
,
2638 100.0 * f
, 100.0 * acc_f
);
2643 return ok
? EXIT_SUCCESS
: EXIT_FAILURE
;