beta-0.89.2
[luatex.git] / source / libs / mpfr / mpfr-3.1.3 / src / mulders.c
blob8f23d21b678bc48ceec8de6f2e07e962d31b7828
1 /* Mulders' MulHigh function (short product)
3 Copyright 2005-2015 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramel projects, INRIA.
6 This file is part of the GNU MPFR Library.
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
23 /* References:
24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26 July 25-27, 2011, pages 7-14.
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
32 #ifndef MUL_FFT_THRESHOLD
33 #define MUL_FFT_THRESHOLD 8448
34 #endif
36 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
37 #ifdef MPFR_MULHIGH_TAB_SIZE
38 static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
39 #else
40 static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
41 #define MPFR_MULHIGH_TAB_SIZE \
42 ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
43 #endif
45 /* Put in rp[n..2n-1] an approximation of the n high limbs
46 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
47 approximation is always less or equal to the truncated full product).
48 Assume 2n limbs are allocated at rp.
50 Implements Algorithm ShortMulNaive from [1].
52 static void
53 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
54 mpfr_limb_srcptr vp, mp_size_t n)
56 mp_size_t i;
58 rp += n - 1;
59 umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
60 which is less than B^n */
61 for (i = 1 ; i < n ; i++)
62 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
63 rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
64 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
67 /* Put in rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
68 Assume 2n limbs are allocated at rp. */
69 static void
70 mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
71 mpfr_limb_srcptr vp, mp_size_t n)
73 mp_size_t i;
75 rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
76 for (i = 1 ; i < n ; i++)
77 mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
80 /* Put in rp[n..2n-1] an approximation of the n high limbs
81 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
82 approximation is always less or equal to the truncated full product).
84 Implements Algorithm ShortMul from [1].
86 void
87 mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
88 mp_size_t n)
90 mp_size_t k;
92 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
93 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
94 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
95 into k >= (n+4)/2 in the C language. */
96 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
97 if (k < 0)
98 mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
99 else if (k == 0)
100 mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
101 else if (n > MUL_FFT_THRESHOLD)
102 mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
103 else
105 mp_size_t l = n - k;
106 mp_limb_t cy;
108 mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
109 mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */
110 cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
111 mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */
112 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
113 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
117 /* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
118 Assume 2n limbs are allocated at rp. */
119 void
120 mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
121 mp_size_t n)
123 mp_size_t k;
125 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
126 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
127 MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
128 if (k < 0)
129 mpn_mul_basecase (rp, np, n, mp, n);
130 else if (k == 0)
131 mpfr_mullow_n_basecase (rp, np, mp, n);
132 else if (n > MUL_FFT_THRESHOLD)
133 mpn_mul_n (rp, np, mp, n);
134 else
136 mp_size_t l = n - k;
138 mpn_mul_n (rp, np, mp, k); /* fills rp[0..2k] */
139 mpfr_mullow_n (rp + n, np + k, mp, l); /* fills rp[n..n+2l] */
140 mpn_add_n (rp + k, rp + k, rp + n, l + 1);
141 mpfr_mullow_n (rp + n, np, mp + k, l); /* fills rp[n..n+2l] */
142 mpn_add_n (rp + k, rp + k, rp + n, l + 1);
146 #ifdef MPFR_SQRHIGH_TAB_SIZE
147 static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
148 #else
149 static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
150 #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
151 #endif
153 /* Put in rp[n..2n-1] an approximation of the n high limbs
154 of {np, n}^2. The error is less than n ulps of rp[n]. */
155 void
156 mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
158 mp_size_t k;
160 MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
161 k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
162 : (n+4)/2; /* ensures that k >= (n+3)/2 */
163 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
164 if (k < 0)
165 /* we can't use mpn_sqr_basecase here, since it requires
166 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
167 is not exported by GMP */
168 mpn_sqr_n (rp, np, n);
169 else if (k == 0)
170 mpfr_mulhigh_n_basecase (rp, np, np, n);
171 else
173 mp_size_t l = n - k;
174 mp_limb_t cy;
176 mpn_sqr_n (rp + 2 * l, np + l, k); /* fills rp[2l..2n-1] */
177 mpfr_mulhigh_n (rp, np, np + k, l); /* fills rp[l-1..2l-1] */
178 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
179 cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
180 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
181 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
185 #ifdef MPFR_DIVHIGH_TAB_SIZE
186 static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
187 #else
188 static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
189 #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
190 #endif
192 #ifndef __GMPFR_GMP_H__
193 #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
194 #endif
196 #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
197 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
198 with the most significant limb of the quotient as return value (0 or 1).
199 Assumes the most significant bit of D is set. Clobbers N.
201 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
203 static mp_limb_t
204 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
205 mpfr_limb_srcptr dp, mp_size_t n)
207 mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
208 mpfr_pi1_t dinv2;
210 np += n;
212 if ((qh = (mpn_cmp (np, dp, n) >= 0)))
213 mpn_sub_n (np, np, dp, n);
215 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
217 d1 = dp[n - 1];
219 if (n == 1)
221 invert_limb (dinv, d1);
222 umul_ppmm (q1, q0, np[0], dinv);
223 qp[0] = np[0] + q1;
224 return qh;
227 /* now n >= 2 */
228 d0 = dp[n - 2];
229 invert_pi1 (dinv2, d1, d0);
230 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
231 while (n > 1)
233 /* Invariant: it remains to reduce n limbs from N (in addition to the
234 initial low n limbs).
235 Since n >= 2 here, necessarily we had n >= 2 initially, which means
236 that in addition to the limb np[n-1] to reduce, we have at least 2
237 extra limbs, thus accessing np[n-3] is valid. */
239 /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
240 the largest possible partial quotient is B-1 */
241 if (MPFR_UNLIKELY(np[n - 1] == d1 && np[n - 2] == d0))
242 q2 = ~ (mp_limb_t) 0;
243 else
244 udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
245 d1, d0, dinv2.inv32);
246 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
247 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
248 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
249 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
250 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
251 which proves that at most one correction is needed */
252 q0 = mpn_submul_1 (np - 1, dp, n, q2);
253 if (MPFR_UNLIKELY(q0 > np[n - 1]))
255 mpn_add_n (np - 1, np - 1, dp, n);
256 q2 --;
258 qp[--n] = q2;
259 dp ++;
262 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
263 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
264 <= floor((np[0]*B+np[1])/d1)
265 thus q1 is not larger than the true quotient.
266 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
267 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
268 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
269 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
270 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
271 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
273 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
274 np[0]*B/d1 - 2.
276 In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
277 q - 4 <= q1 <= q
279 umul_ppmm (q1, q0, np[0], dinv2.inv32);
280 qp[0] = np[0] + q1;
282 return qh;
284 #endif
286 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
287 with the most significant limb of the quotient as return value (0 or 1).
288 Assumes the most significant bit of D is set. Clobbers N.
290 This implements the ShortDiv algorithm from reference [1].
292 #if 1
293 mp_limb_t
294 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
295 mp_size_t n)
297 mp_size_t k, l;
298 mp_limb_t qh, cy;
299 mpfr_limb_ptr tp;
300 MPFR_TMP_DECL(marker);
302 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
303 k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
305 if (k == 0)
306 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
308 mpfr_pi1_t dinv2;
309 invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
310 return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
312 #else /* use our own code for base-case short division */
313 return mpfr_divhigh_n_basecase (qp, np, dp, n);
314 #endif
315 else if (k == n)
316 /* for k=n, we use a division with remainder (mpn_divrem),
317 which computes the exact quotient */
318 return mpn_divrem (qp, 0, np, 2 * n, dp, n);
320 MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
321 MPFR_TMP_MARK (marker);
322 l = n - k;
323 /* first divide the most significant 2k limbs from N by the most significant
324 k limbs of D */
325 qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
327 /* it remains {np,2l+k} = {np,n+l} as remainder */
329 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
330 D0={dp,l} */
331 tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
332 mpfr_mulhigh_n (tp, qp + k, dp, l);
333 /* we are only interested in the upper l limbs from {tp,2l} */
334 cy = mpn_sub_n (np + n, np + n, tp + l, l);
335 if (qh)
336 cy += mpn_sub_n (np + n, np + n, dp, l);
337 while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
339 qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
340 cy -= mpn_add_n (np + l, np + l, dp, n);
343 /* now it remains {np,n+l} to divide by D */
344 cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
345 qh += mpn_add_1 (qp + l, qp + l, k, cy);
346 MPFR_TMP_FREE(marker);
348 return qh;
350 #else /* below is the FoldDiv(K) algorithm from [1] */
351 mp_limb_t
352 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
353 mp_size_t n)
355 mp_size_t k, r;
356 mpfr_limb_ptr ip, tp, up;
357 mp_limb_t qh = 0, cy, cc;
358 int count;
359 MPFR_TMP_DECL(marker);
361 #define K 3
362 if (n < K)
363 return mpn_divrem (qp, 0, np, 2 * n, dp, n);
365 k = (n - 1) / K + 1; /* ceil(n/K) */
367 MPFR_TMP_MARK (marker);
368 ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
369 tp = MPFR_TMP_LIMBS_ALLOC (n + k);
370 up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
371 mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
372 /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
373 for (r = n, cc = 0UL; r > 0;)
375 /* cc is the carry at np[n+r] */
376 MPFR_ASSERTD(cc <= 1);
377 /* FIXME: why can we have cc as large as say 8? */
378 count = 0;
379 while (cc > 0)
381 count ++;
382 MPFR_ASSERTD(count <= 1);
383 /* subtract {dp+n-r,r} from {np+n,r} */
384 cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
385 /* add 1 at qp[r] */
386 qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
388 /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
389 if (r < k)
391 ip += k - r;
392 k = r;
394 /* now r >= k */
395 /* qp + r - 2 * k -> up */
396 mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
397 /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
398 cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
399 /* since we need only r limbs of tp (below), it suffices to consider
400 r high limbs of dp */
401 if (r > k)
403 #if 0
404 mpn_mul (tp, dp + n - r, r, qp + r - k, k);
405 #else /* use a short product for the low k x k limbs */
406 /* we know the upper k limbs of the r-limb product cancel with the
407 remainder, thus we only need to compute the low r-k limbs */
408 if (r - k >= k)
409 mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
410 else /* r-k < k */
412 /* #define LOW */
413 #ifndef LOW
414 mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
415 #else
416 mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
417 /* take into account qp[2r-2k] * dp[n - r + k] */
418 tp[r] += qp[2*r-2*k] * dp[n - r + k];
419 #endif
420 /* tp[k..r] is filled */
422 #if 0
423 mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
424 #else /* compute one more limb. FIXME: we could add one limb of dp in the
425 above, to save one mpn_addmul_1 call */
426 mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
427 /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
428 up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
429 /* add {dp+n-r, k} * qp[r-1] */
430 up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
431 #endif
432 #ifndef LOW
433 cc = mpn_add_n (tp + k, tp + k, up + k, k);
434 mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
435 #else
436 /* update tp[k..r] */
437 if (r - k + 1 <= k)
438 mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
439 else /* r - k >= k */
441 cc = mpn_add_n (tp + k, tp + k, up + k, k);
442 mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
444 #endif
445 #endif
447 else /* last step: since we only want the quotient, no need to update,
448 just propagate the carry cy */
450 MPFR_ASSERTD(r < n);
451 if (cy > 0)
452 qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
453 break;
455 /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
456 update {np+n, n} */
457 /* we should have tp[r] = np[n+r-k] up to 1 */
458 MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
459 #ifndef LOW
460 cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
461 #else
462 cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
463 #endif
464 /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
465 {dp+n-r,r} from {np+n,r} */
466 if (cy)
468 if (r < n)
469 cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
470 else
471 cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
472 /* propagate cy */
473 if (r == n)
474 qh = cy;
475 else
476 qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
478 /* cc is the borrow at np[n+r] */
479 count = 0;
480 while (cc > 0) /* quotient was too large */
482 count++;
483 MPFR_ASSERTD (count <= 1);
484 cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
485 cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
486 qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
488 r -= k;
489 cc = np[n + r];
491 MPFR_TMP_FREE(marker);
493 return qh;
495 #endif