/* mpn_gcdext -- Extended Greatest Common Divisor.

Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
   the size is returned (if inputs are non-normalized, result may be
   non-normalized too). Temporary space needed is M->n + n.
 */
static size_t
hgcd_mul_matrix_vector (struct hgcd_matrix *M,
                        mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ah, bh;

  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as

     t  = u00 * a
     r  = u10 * b
     r += t;

     t  = u11 * b
     b  = u01 * a
     b += t;
  */
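  /* Small single-limb illustration (values chosen only for the example):
     with u00 = 3, u01 = 1, u10 = 2, u11 = 1 and (a; b) = (5; 7),
       r = u00*a + u10*b = 3*5 + 2*7 = 29
       b = u01*a + u11*b = 1*5 + 1*7 = 12
     so (a; b) M maps (5; 7) to (29; 12).  */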
  if (M->n >= n)
    {
      mpn_mul (tp, M->p[0][0], M->n, ap, n);
      mpn_mul (rp, M->p[1][0], M->n, bp, n);
    }
  else
    {
      mpn_mul (tp, ap, n, M->p[0][0], M->n);
      mpn_mul (rp, bp, n, M->p[1][0], M->n);
    }

  ah = mpn_add_n (rp, rp, tp, n + M->n);

  if (M->n >= n)
    {
      mpn_mul (tp, M->p[1][1], M->n, bp, n);
      mpn_mul (bp, M->p[0][1], M->n, ap, n);
    }
  else
    {
      mpn_mul (tp, bp, n, M->p[1][1], M->n);
      mpn_mul (bp, ap, n, M->p[0][1], M->n);
    }

  bh = mpn_add_n (bp, bp, tp, n + M->n);

  n += M->n;
  if ( (ah | bh) > 0)
    {
      rp[n] = ah;
      bp[n] = bh;
      n++;
    }
  else
    {
      /* Normalize */
      while ( (rp[n-1] | bp[n-1]) == 0)
        n--;
    }

  return n;
}

#define COMPUTE_V_ITCH(n) (2*(n))
/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs. */
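/* Illustration with small values: a = 6, b = 4, g = 2, u = 1 gives
   |v| = |g - u a| / b = |2 - 6| / 4 = 1, and since u > 0 the cofactor is
   v = -1, consistent with 2 = 1*6 + (-1)*4.  */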
static mp_size_t
compute_v (mp_ptr vp,
           mp_srcptr ap, mp_srcptr bp, mp_size_t n,
           mp_srcptr gp, mp_size_t gn,
           mp_srcptr up, mp_size_t usize,
           mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;
  mp_size_t bn;
  mp_size_t vn;

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
        return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
         (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);

  return vn;
}
/* Temporary storage:

   Initial division: Quotient of at most an - n + 1 <= an limbs.

   Storage for u0 and u1: 2(n+1).

   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)

   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.

   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1)
   for the cofactors.

   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.

   For the lehmer call after the loop, let T denote
   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
   sufficient for both operations.

*/
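/* Spelling out the last two bounds: the gcdext_lehmer call needs
   3T (for u, a and b) + (4T+3) scratch = 7T + 3 limbs, while compute_v
   needs T + (T+1) + 2T = 4T + 1 limbs, so 7T + 3 covers both.  */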
/* Optimal choice of p seems difficult. In each iteration the division
 * of work between hgcd and the updates of u0 and u1 depends on the
 * current size of the u. It may be desirable to use a different
 * choice of p in each iteration. Also the input size seems to matter;
 * choosing p = n / 3 in the first iteration seems to improve
 * performance slightly for input size just above the threshold, but
 * degrade performance for larger inputs. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
            mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;
  mp_size_t scratch;
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;

  struct gcdext_ctx ctx;
  mp_size_t un;
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
        talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
         copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
        + 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
        talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
        {
          MPN_COPY (gp, bp, n);
          *usizep = 0;
          TMP_FREE;
          return n;
        }
    }
  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
        ASSERT (M.n <= (n - p - 1)/2);
        ASSERT (M.n + p <= (p + n - 1) / 2);

        /* Temporary storage 2 (p + M->n) <= p + n - 1 */
        n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

        MPN_COPY (u0, M.p[1][0], M.n);
        MPN_COPY (u1, M.p[1][1], M.n);
        un = M.n;
        while ( (u0[un-1] | u1[un-1] ) == 0)
          un--;
      }
    else
      {
        /* mpn_hgcd has failed. Then either one of a or b is very
           small, or the difference is very small. Perform one
           subtraction followed by one division. */
        u1[0] = 1;

        ctx.u0 = u0;
        ctx.u1 = u1;
        ctx.tp = tp + n; /* ualloc */
        ctx.un = 1;

        /* Temporary storage n */
        n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
        if (n == 0)
          {
            TMP_FREE;
            return ctx.gn;
          }

        un = ctx.un;
        ASSERT (un < ualloc);
      }
  }
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
        {
          mp_ptr t0;

          t0 = tp + matrix_scratch;
          ASSERT (M.n <= (n - p - 1)/2);
          ASSERT (M.n + p <= (p + n - 1) / 2);

          /* Temporary storage 2 (p + M->n) <= p + n - 1 */
          n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

          /* By the same analysis as for mpn_hgcd_matrix_mul */
          ASSERT (M.n + un <= ualloc);

          /* FIXME: This copying could be avoided by some swapping of
           * pointers. May need more temporary storage, though. */
          MPN_COPY (t0, u0, un);

          /* Temporary storage ualloc */
          un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

          ASSERT (un < ualloc);
          ASSERT ( (u0[un-1] | u1[un-1]) > 0);
        }
      else
        {
          /* mpn_hgcd has failed. Then either one of a or b is very
             small, or the difference is very small. Perform one
             subtraction followed by one division. */
          ctx.u0 = u0;
          ctx.u1 = u1;
          ctx.tp = tp + n; /* ualloc */
          ctx.un = un;

          /* Temporary storage n */
          n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
          if (n == 0)
            {
              TMP_FREE;
              return ctx.gn;
            }

          un = ctx.un;
          ASSERT (un < ualloc);
        }
    }
  /* We have A = ... a + ... b
             B =  u0 a +  u1 b

             a =  u1 A + ... B
             b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */
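  /* A small worked instance of this bookkeeping (values chosen only to
     satisfy the relations above): take A = 38, B = 14, and suppose the
     loop has reduced to a = 10, b = 4 with u0 = 1, u1 = 1; indeed
     14 = 1*10 + 1*4, 10 = 1*38 - 2*14 and 4 = -1*38 + 3*14. A final
     step giving g = 2 = 1*10 + (-2)*4 has u = 1, v = -2, so the cofactor
     of A is u u1 - v u0 = 1*1 - (-2)*1 = 3, matching 2 = 3*38 - 8*14.  */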
  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
         this case we choose the cofactor + 1, corresponding to G = A
         - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
        {
          MPN_NORMALIZE (u0, un);
          MPN_COPY (up, u0, un);
          *usizep = -un;
        }
      else
        {
          MPN_NORMALIZE_NOT_ZERO (u1, un);
          MPN_COPY (up, u1, un);
          *usizep = un;
        }

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
        {
          /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
          MPN_COPY (up, u0, u0n);
          *usizep = -u0n;

          TMP_FREE;
          return gn;
        }

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
                             ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
        negate = 0;
      else
        {
          lehmer_un = -lehmer_un;
          negate = 1;
        }

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
        /* Should be the common case */
        mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
        mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);

      if (lehmer_vn > 0)
        {
          mp_limb_t cy;

          /* Overwrites old u1 value */
          if (lehmer_vn <= u0n)
            /* Should be the common case */
            mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
          else
            mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

          u1n = u0n + lehmer_vn;
          u1n -= (u1[u1n - 1] == 0);

          if (u1n <= un)
            {
              cy = mpn_add (up, up, un, u1, u1n);
            }
          else
            {
              cy = mpn_add (up, u1, u1n, up, un);
              un = u1n;
            }
          up[un] = cy;
          un += (cy != 0);
        }

      ASSERT (un < ualloc);

      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
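/* A minimal usage sketch (illustrative only, not compiled into the
   library): the public mpz_gcdext interface, which is built on
   mpn_gcdext above, returns g and cofactors s, t with g = s*a + t*b.  */
#if 0
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t a, b, g, s, t;

  mpz_inits (a, b, g, s, t, NULL);
  mpz_set_ui (a, 38);
  mpz_set_ui (b, 14);

  /* Extended GCD: for these values gcd = 2, e.g. 2 = 3*38 + (-8)*14. */
  mpz_gcdext (g, s, t, a, b);
  gmp_printf ("gcd = %Zd = (%Zd)*%Zd + (%Zd)*%Zd\n", g, s, a, t, b);

  mpz_clears (a, b, g, s, t, NULL);
  return 0;
}
#endif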