1 /* mpn_gcdext -- Extended Greatest Common Divisor.
3 Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
21 or both in parallel, as here.
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
36 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
37 the size is returned (if inputs are non-normalized, result may be
38 non-normalized too). Temporary space needed is M->n + n.
/* NOTE(review): this chunk is a fragmentary extraction of GMP's
   gcdext.c -- the return type line, local declarations (ah, bh, size
   bookkeeping), braces, if/else lines and the return statement are
   missing.  Only the surviving statements are annotated below; do not
   assume the visible text is compilable as-is. */
41 hgcd_mul_matrix_vector (struct hgcd_matrix
*M
,
42 mp_ptr rp
, mp_srcptr ap
, mp_ptr bp
, mp_size_t n
, mp_ptr tp
)
46 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
/* First column: r <- u00 a + u10 b.  Two argument orders for the same
   products appear below; mpn_mul requires the first operand to be at
   least as long as the second, so these presumably sit in the two arms
   of a missing `M->n >= n` test -- TODO confirm against upstream. */
59 mpn_mul (tp
, M
->p
[0][0], M
->n
, ap
, n
);
60 mpn_mul (rp
, M
->p
[1][0], M
->n
, bp
, n
);
/* Alternate operand order, presumably for the n > M->n case. */
64 mpn_mul (tp
, ap
, n
, M
->p
[0][0], M
->n
);
65 mpn_mul (rp
, bp
, n
, M
->p
[1][0], M
->n
);
/* r += t over the n + M->n low limbs; ah receives the carry limb. */
68 ah
= mpn_add_n (rp
, rp
, tp
, n
+ M
->n
);
/* Second column: b <- u01 a + u11 b, same two operand orders. */
72 mpn_mul (tp
, M
->p
[1][1], M
->n
, bp
, n
);
73 mpn_mul (bp
, M
->p
[0][1], M
->n
, ap
, n
);
77 mpn_mul (tp
, bp
, n
, M
->p
[1][1], M
->n
);
78 mpn_mul (bp
, ap
, n
, M
->p
[0][1], M
->n
);
/* b += t; bh receives the carry limb, analogous to ah above. */
80 bh
= mpn_add_n (bp
, bp
, tp
, n
+ M
->n
);
/* Strip common high zero limbs of r and b (joint normalization);
   the loop body / size decrement lines are missing here. */
92 while ( (rp
[n
-1] | bp
[n
-1]) == 0)
99 #define COMPUTE_V_ITCH(n) (2*(n))
101 /* Computes |v| = |(g - u a)| / b, where u may be positive or
102 negative, and v is of the opposite sign. max(a, b) is of size n, u and
103 v at most size n, and v must have space for n+1 limbs. */
/* NOTE(review): fragmentary extraction -- the return type, local
   declarations (an, bn, size, vn, tp), sign handling of usize, braces
   and the return statement are missing.  Only surviving statements
   are annotated. */
105 compute_v (mp_ptr vp
,
106 mp_srcptr ap
, mp_srcptr bp
, mp_size_t n
,
107 mp_srcptr gp
, mp_size_t gn
,
108 mp_srcptr up
, mp_size_t usize
,
/* u must be non-zero and normalized (top limb non-zero). */
122 ASSERT (up
[size
-1] > 0);
125 MPN_NORMALIZE (ap
, an
);
/* t = |u| * a.  Two operand orders appear; mpn_mul wants the longer
   operand first, so these presumably belong to a missing
   `an >= size` if/else -- TODO confirm against upstream. */
129 mpn_mul (tp
, ap
, an
, up
, size
);
131 mpn_mul (tp
, up
, size
, ap
, an
);
/* u > 0 case: subtract g from t, giving |v| b = u a - g. */
137 /* |v| = -v = (u a - g) / b */
139 ASSERT_NOCARRY (mpn_sub (tp
, tp
, size
, gp
, gn
));
140 MPN_NORMALIZE (tp
, size
);
/* u < 0 case: g is added instead, per the comment below. */
145 { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
146 (g + |u| a) always fits in (|usize| + an) limbs. */
148 ASSERT_NOCARRY (mpn_add (tp
, tp
, size
, gp
, gn
));
/* Drop a possible high zero limb of the sum. */
149 size
-= (tp
[size
- 1] == 0);
152 /* Now divide t / b. There must be no remainder */
154 MPN_NORMALIZE (bp
, bn
);
158 ASSERT (vn
<= n
+ 1);
/* Exact division: by the extended-gcd identity g = u a + v b, b
   divides t exactly, hence mpn_divexact. */
160 mpn_divexact (vp
, tp
, size
, bp
, bn
);
/* Normalize v by the one possible high zero limb. */
161 vn
-= (vp
[vn
-1] == 0);
166 /* Temporary storage:
168 Initial division: Quotient of at most an - n + 1 <= an limbs.
170 Storage for u0 and u1: 2(n+1).
172 Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
174 Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
176 When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
178 When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
180 For the lehmer call after the loop, Let T denote
181 GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
182 u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
183 for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
184 sufficient for both operations.
188 /* Optimal choice of p seems difficult. In each iteration the division
189 * of work between hgcd and the updates of u0 and u1 depends on the
190 * current size of the u. It may be desirable to use a different
191 * choice of p in each iteration. Also the input size seems to matter;
192 * choosing p = n / 3 in the first iteration seems to improve
193 * performance slightly for input size just above the threshold, but
194 * degrade performance for larger inputs. */
195 #define CHOOSE_P_1(n) ((n) / 2)
196 #define CHOOSE_P_2(n) ((n) / 3)
199 mpn_gcdext (mp_ptr gp
, mp_ptr up
, mp_size_t
*usizep
,
200 mp_ptr ap
, mp_size_t an
, mp_ptr bp
, mp_size_t n
)
/* NOTE(review): fragmentary extraction -- the return type, many local
   declarations (talloc, scratch, tp, u0, u1, un, nn, gn, cy, negate,
   lehmer_* ...), braces, else branches, TMP_DECL/TMP_FREE and return
   statements are missing.  Comments annotate surviving statements
   only.  Visible contract: computes the gcd of (A,B) into gp and a
   cofactor u into up, with the cofactor's sign encoded in the sign of
   *usizep (see the final statement); ap/bp are destroyed. */
204 mp_size_t matrix_scratch
;
/* Cofactor working size: n + 1 limbs each. */
205 mp_size_t ualloc
= n
+ 1;
207 struct gcdext_ctx ctx
;
/* b must be normalized (non-zero top limb). */
218 ASSERT (bp
[n
-1] > 0);
222 /* FIXME: Check for small sizes first, before setting up temporary
/* Scratch sizing: take the maximum requirement over all phases. */
224 talloc
= MPN_GCDEXT_LEHMER_N_ITCH(n
);
226 /* For initial division */
227 scratch
= an
- n
+ 1;
228 if (scratch
> talloc
)
/* Above the divide-and-conquer threshold, also account for the hgcd
   matrix plus the larger of hgcd scratch and update scratch. */
231 if (ABOVE_THRESHOLD (n
, GCDEXT_DC_THRESHOLD
))
234 mp_size_t hgcd_scratch
;
235 mp_size_t update_scratch
;
236 mp_size_t p1
= CHOOSE_P_1 (n
);
237 mp_size_t p2
= CHOOSE_P_2 (n
);
238 mp_size_t min_p
= MIN(p1
, p2
);
239 mp_size_t max_p
= MAX(p1
, p2
);
240 matrix_scratch
= MPN_HGCD_MATRIX_INIT_ITCH (n
- min_p
);
241 hgcd_scratch
= mpn_hgcd_itch (n
- min_p
);
242 update_scratch
= max_p
+ n
- 1;
244 scratch
= matrix_scratch
+ MAX(hgcd_scratch
, update_scratch
);
245 if (scratch
> talloc
)
248 /* Final mpn_gcdext_lehmer_n call. Need space for u and for
249 copies of a and b. */
250 scratch
= MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD
)
251 + 3*GCDEXT_DC_THRESHOLD
;
253 if (scratch
> talloc
)
256 /* Cofactors u0 and u1 */
260 tp
= TMP_ALLOC_LIMBS(talloc
);
/* Initial division a = q b + r: quotient to tp, remainder (n limbs)
   replaces a in place. */
264 mpn_tdiv_qr (tp
, ap
, 0, ap
, an
, bp
, n
);
/* If b divides a, gcd = b; the cofactor-setting and return lines are
   missing from this extraction. */
266 if (mpn_zero_p (ap
, n
))
268 MPN_COPY (gp
, bp
, n
);
/* Small operands: delegate entirely to the Lehmer algorithm. */
275 if (BELOW_THRESHOLD (n
, GCDEXT_DC_THRESHOLD
))
277 mp_size_t gn
= mpn_gcdext_lehmer_n(gp
, up
, usizep
, ap
, bp
, n
, tp
);
/* Carve the cofactor accumulators u0, u1 (ualloc limbs each, zeroed)
   out of the front of the scratch area. */
283 MPN_ZERO (tp
, 2*ualloc
);
284 u0
= tp
; tp
+= ualloc
;
285 u1
= tp
; tp
+= ualloc
;
292 /* For the first hgcd call, there are no u updates, and it makes
293 some sense to use a different choice for p. */
295 /* FIXME: We could trim use of temporary storage, since u0 and u1
296 are not used yet. For the hgcd call, we could swap in the u0
297 and u1 pointers for the relevant matrix elements. */
299 struct hgcd_matrix M
;
300 mp_size_t p
= CHOOSE_P_1 (n
);
/* First reduction: run hgcd on the high n - p limbs of a and b. */
303 mpn_hgcd_matrix_init (&M
, n
- p
, tp
);
304 nn
= mpn_hgcd (ap
+ p
, bp
+ p
, n
- p
, &M
, tp
+ matrix_scratch
);
/* Presumably the hgcd-success branch (nn > 0); the guarding `if`
   line is missing here. */
307 ASSERT (M
.n
<= (n
- p
- 1)/2);
308 ASSERT (M
.n
+ p
<= (p
+ n
- 1) / 2);
310 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
/* Propagate the reduction of the high parts to the full numbers. */
311 n
= mpn_hgcd_matrix_adjust (&M
, p
+ nn
, ap
, bp
, p
, tp
+ matrix_scratch
);
/* Seed the cofactors from the bottom row of M. */
313 MPN_COPY (u0
, M
.p
[1][0], M
.n
);
314 MPN_COPY (u1
, M
.p
[1][1], M
.n
);
/* Normalize un past common high zero limbs (loop body missing). */
316 while ( (u0
[un
-1] | u1
[un
-1] ) == 0)
321 /* mpn_hgcd has failed. Then either one of a or b is very
322 small, or the difference is very small. Perform one
323 subtraction followed by one division. */
328 ctx
.tp
= tp
+ n
; /* ualloc */
331 /* Temporary storage n */
332 n
= mpn_gcd_subdiv_step (ap
, bp
, n
, 0, mpn_gcdext_hook
, &ctx
, tp
);
340 ASSERT (un
< ualloc
);
/* Main divide-and-conquer loop: hgcd step plus cofactor update while
   the operands remain above the DC threshold. */
344 while (ABOVE_THRESHOLD (n
, GCDEXT_DC_THRESHOLD
))
346 struct hgcd_matrix M
;
347 mp_size_t p
= CHOOSE_P_2 (n
);
350 mpn_hgcd_matrix_init (&M
, n
- p
, tp
);
351 nn
= mpn_hgcd (ap
+ p
, bp
+ p
, n
- p
, &M
, tp
+ matrix_scratch
);
356 t0
= tp
+ matrix_scratch
;
357 ASSERT (M
.n
<= (n
- p
- 1)/2);
358 ASSERT (M
.n
+ p
<= (p
+ n
- 1) / 2);
360 /* Temporary storage 2 (p + M->n) <= p + n - 1 */
361 n
= mpn_hgcd_matrix_adjust (&M
, p
+ nn
, ap
, bp
, p
, t0
);
363 /* By the same analysis as for mpn_hgcd_matrix_mul */
364 ASSERT (M
.n
+ un
<= ualloc
);
366 /* FIXME: This copying could be avoided by some swapping of
367 * pointers. May need more temporary storage, though. */
368 MPN_COPY (t0
, u0
, un
);
370 /* Temporary storage ualloc */
/* Cofactor update (u0;u1) <- (u0;u1) M via the helper defined
   earlier in this file. */
371 un
= hgcd_mul_matrix_vector (&M
, u0
, t0
, u1
, un
, t0
+ un
);
373 ASSERT (un
< ualloc
);
374 ASSERT ( (u0
[un
-1] | u1
[un
-1]) > 0);
378 /* mpn_hgcd has failed. Then either one of a or b is very
379 small, or the difference is very small. Perform one
380 subtraction followed by one division. */
383 ctx
.tp
= tp
+ n
; /* ualloc */
386 /* Temporary storage n */
387 n
= mpn_gcd_subdiv_step (ap
, bp
, n
, 0, mpn_gcdext_hook
, &ctx
, tp
);
395 ASSERT (un
< ualloc
);
398 /* We have A = ... a + ... b
406 |u0|, |u1| <= B / min(a, b)
408 We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
409 in which case the only reduction done so far is a = A - k B for
412 Compute g = u a + v b = (u u1 - v u0) A + (...) B
413 Here, u, v are bounded by
419 ASSERT ( (ap
[n
-1] | bp
[n
-1]) > 0);
/* Special case a == b: the gcd is a itself. */
421 if (UNLIKELY (mpn_cmp (ap
, bp
, n
) == 0))
423 /* Must return the smallest cofactor, +u1 or -u0 */
426 MPN_COPY (gp
, ap
, n
);
428 MPN_CMP (c
, u0
, u1
, un
);
429 /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
430 this case we choose the cofactor + 1, corresponding to G = A
431 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
432 ASSERT (c
!= 0 || (un
== 1 && u0
[0] == 1 && u1
[0] == 1));
/* One branch copies -u0 as the (smaller) cofactor... */
435 MPN_NORMALIZE (u0
, un
);
436 MPN_COPY (up
, u0
, un
);
/* ...the other copies +u1, which is known non-zero. */
441 MPN_NORMALIZE_NOT_ZERO (u1
, un
);
442 MPN_COPY (up
, u1
, un
);
/* u0 == 0 (with un == 1): per the comment block above, only
   a = A - k B reductions happened; plain Lehmer finishes. */
449 else if (UNLIKELY (u0
[0] == 0) && un
== 1)
454 /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
455 gn
= mpn_gcdext_lehmer_n (gp
, up
, usizep
, ap
, bp
, n
, tp
);
/* General case: finish with Lehmer on COPIES of a and b, then
   combine its cofactors with the accumulated u0/u1. */
472 lehmer_up
= tp
; tp
+= n
;
474 /* Call mpn_gcdext_lehmer_n with copies of a and b. */
475 MPN_COPY (tp
, ap
, n
);
476 MPN_COPY (tp
+ n
, bp
, n
);
477 gn
= mpn_gcdext_lehmer_n (gp
, lehmer_up
, &lehmer_un
, tp
, tp
+ n
, n
, tp
+ 2*n
);
480 MPN_NORMALIZE (u0
, u0n
);
485 /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) B */
486 MPN_COPY (up
, u0
, u0n
);
494 /* Compute v = (g - u a) / b */
495 lehmer_vn
= compute_v (lehmer_vp
,
496 ap
, bp
, n
, gp
, gn
, lehmer_up
, lehmer_un
, tp
+ n
+ 1);
/* Negative lehmer_un encodes a negative Lehmer cofactor u. */
502 lehmer_un
= -lehmer_un
;
507 MPN_NORMALIZE (u1
, u1n
);
510 ASSERT (lehmer_un
+ u1n
<= ualloc
);
511 ASSERT (lehmer_vn
+ u0n
<= ualloc
);
513 /* We may still have v == 0 */
/* up = u1 * lehmer_u; mpn_mul needs the longer operand first,
   hence the two argument orders below. */
516 if (lehmer_un
<= u1n
)
517 /* Should be the common case */
518 mpn_mul (up
, u1
, u1n
, lehmer_up
, lehmer_un
);
520 mpn_mul (up
, lehmer_up
, lehmer_un
, u1
, u1n
);
522 un
= u1n
+ lehmer_un
;
523 un
-= (up
[un
- 1] == 0);
529 /* Overwrites old u1 value */
/* u1 = u0 * lehmer_v, same operand-order convention. */
530 if (lehmer_vn
<= u0n
)
531 /* Should be the common case */
532 mpn_mul (u1
, u0
, u0n
, lehmer_vp
, lehmer_vn
);
534 mpn_mul (u1
, lehmer_vp
, lehmer_vn
, u0
, u0n
);
536 u1n
= u0n
+ lehmer_vn
;
537 u1n
-= (u1
[u1n
- 1] == 0);
/* Final combination |u| = u*u1 + v*u0; mpn_add's longer operand goes
   first, so presumably an `un >= u1n` test (missing line) selects
   between the two forms; cy is the carry out. */
541 cy
= mpn_add (up
, up
, un
, u1
, u1n
);
545 cy
= mpn_add (up
, u1
, u1n
, up
, un
);
551 ASSERT (un
< ualloc
);
/* Cofactor sign is reported through the sign of *usizep. */
553 *usizep
= negate
? -un
: un
;