new beta-0.90.0
[luatex.git] / source / libs / mpfr / mpfr-src / src / mulders.c
bloba988db84bcb111a56d79f0818238954b36f00c82
1 /* Mulders' MulHigh function (short product)
3 Copyright 2005-2016 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramba projects, INRIA.
6 This file is part of the GNU MPFR Library.
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
23 /* References:
24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26 July 25-27, 2011, pages 7-14.
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
32 #ifndef MUL_FFT_THRESHOLD
33 #define MUL_FFT_THRESHOLD 8448
34 #endif
36 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
37 #ifdef MPFR_MULHIGH_TAB_SIZE
38 static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
39 #else
40 static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
41 #define MPFR_MULHIGH_TAB_SIZE \
42 ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
43 #endif
45 /* Put in rp[n..2n-1] an approximation of the n high limbs
46 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
47 approximation is always less or equal to the truncated full product).
48 Assume 2n limbs are allocated at rp.
50 Implements Algorithm ShortMulNaive from [1].
52 static void
53 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
54 mpfr_limb_srcptr vp, mp_size_t n)
56 mp_size_t i;
58 rp += n - 1;
59 umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
60 which is less than B^n */
61 for (i = 1 ; i < n ; i++)
62 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
63 rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
64 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
67 /* Put in rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
68 Assume 2n limbs are allocated at rp. */
69 static void
70 mpfr_mullow_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
71 mpfr_limb_srcptr vp, mp_size_t n)
73 mp_size_t i;
75 rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
76 for (i = 1 ; i < n ; i++)
77 mpn_addmul_1 (rp + i, up, n - i + 1, vp[i]);
80 /* Put in rp[n..2n-1] an approximation of the n high limbs
81 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
82 approximation is always less or equal to the truncated full product).
84 Implements Algorithm ShortMul from [1].
86 void
87 mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
88 mp_size_t n)
90 mp_size_t k;
92 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
93 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
94 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
95 into k >= (n+4)/2 in the C language. */
96 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
97 if (k < 0)
98 mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
99 else if (k == 0)
100 mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
101 else if (n > MUL_FFT_THRESHOLD)
102 mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
103 else
105 mp_size_t l = n - k;
106 mp_limb_t cy;
108 mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
109 mpfr_mulhigh_n (rp, np + k, mp, l); /* fills rp[l-1..2l-1] */
110 cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
111 mpfr_mulhigh_n (rp, np, mp + k, l); /* fills rp[l-1..2l-1] */
112 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
113 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
117 /* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
118 Assume 2n limbs are allocated at rp. */
119 void
120 mpfr_mullow_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
121 mp_size_t n)
123 mp_size_t k;
125 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
126 k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
127 MPFR_ASSERTD (k == -1 || k == 0 || (2 * k >= n && k < n));
128 if (k < 0)
129 mpn_mul_basecase (rp, np, n, mp, n);
130 else if (k == 0)
131 mpfr_mullow_n_basecase (rp, np, mp, n);
132 else if (n > MUL_FFT_THRESHOLD)
133 mpn_mul_n (rp, np, mp, n);
134 else
136 mp_size_t l = n - k;
138 mpn_mul_n (rp, np, mp, k); /* fills rp[0..2k] */
139 mpfr_mullow_n (rp + n, np + k, mp, l); /* fills rp[n..n+2l] */
140 mpn_add_n (rp + k, rp + k, rp + n, l + 1);
141 mpfr_mullow_n (rp + n, np, mp + k, l); /* fills rp[n..n+2l] */
142 mpn_add_n (rp + k, rp + k, rp + n, l + 1);
146 #ifdef MPFR_SQRHIGH_TAB_SIZE
147 static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
148 #else
149 static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
150 #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
151 #endif
153 /* Put in rp[n..2n-1] an approximation of the n high limbs
154 of {np, n}^2. The error is less than n ulps of rp[n]. */
155 void
156 mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
158 mp_size_t k;
160 MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
161 k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
162 : (n+4)/2; /* ensures that k >= (n+3)/2 */
163 MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
164 if (k < 0)
165 /* we can't use mpn_sqr_basecase here, since it requires
166 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
167 is not exported by GMP */
168 mpn_sqr_n (rp, np, n);
169 else if (k == 0)
170 mpfr_mulhigh_n_basecase (rp, np, np, n);
171 else
173 mp_size_t l = n - k;
174 mp_limb_t cy;
176 mpn_sqr_n (rp + 2 * l, np + l, k); /* fills rp[2l..2n-1] */
177 mpfr_mulhigh_n (rp, np, np + k, l); /* fills rp[l-1..2l-1] */
178 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
179 cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
180 cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
181 mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
185 #ifdef MPFR_DIVHIGH_TAB_SIZE
186 static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
187 #else
188 static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
189 #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
190 #endif
192 #ifndef __GMPFR_GMP_H__
193 #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
194 #endif
196 #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
197 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
198 with the most significant limb of the quotient as return value (0 or 1).
199 Assumes the most significant bit of D is set. Clobbers N.
201 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
203 static mp_limb_t
204 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
205 mpfr_limb_srcptr dp, mp_size_t n)
207 mp_limb_t qh, d1, d0, dinv, q2, q1, q0;
208 mpfr_pi1_t dinv2;
210 np += n;
212 if ((qh = (mpn_cmp (np, dp, n) >= 0)))
213 mpn_sub_n (np, np, dp, n);
215 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
217 d1 = dp[n - 1];
219 if (n == 1)
221 invert_limb (dinv, d1);
222 umul_ppmm (q1, q0, np[0], dinv);
223 qp[0] = np[0] + q1;
224 return qh;
227 /* now n >= 2 */
228 d0 = dp[n - 2];
229 invert_pi1 (dinv2, d1, d0);
230 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
231 while (n > 1)
233 /* Invariant: it remains to reduce n limbs from N (in addition to the
234 initial low n limbs).
235 Since n >= 2 here, necessarily we had n >= 2 initially, which means
236 that in addition to the limb np[n-1] to reduce, we have at least 2
237 extra limbs, thus accessing np[n-3] is valid. */
239 /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here,
240 since we truncate the divisor at each step, but since {np,n} < D
241 originally, the largest possible partial quotient is B-1. */
242 if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0)))
243 q2 = ~ (mp_limb_t) 0;
244 else
245 udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
246 d1, d0, dinv2.inv32);
247 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
248 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
249 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
250 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
251 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
252 which proves that at most one correction is needed */
253 q0 = mpn_submul_1 (np - 1, dp, n, q2);
254 if (MPFR_UNLIKELY(q0 > np[n - 1]))
256 mpn_add_n (np - 1, np - 1, dp, n);
257 q2 --;
259 qp[--n] = q2;
260 dp ++;
263 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
264 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
265 <= floor((np[0]*B+np[1])/d1)
266 thus q1 is not larger than the true quotient.
267 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
268 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
269 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
270 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
271 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
272 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
274 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
275 np[0]*B/d1 - 2.
277 In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
278 q - 4 <= q1 <= q
280 umul_ppmm (q1, q0, np[0], dinv2.inv32);
281 qp[0] = np[0] + q1;
283 return qh;
285 #endif
287 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
288 with the most significant limb of the quotient as return value (0 or 1).
289 Assumes the most significant bit of D is set. Clobbers N.
291 This implements the ShortDiv algorithm from reference [1].
293 #if 1
294 mp_limb_t
295 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
296 mp_size_t n)
298 mp_size_t k, l;
299 mp_limb_t qh, cy;
300 mpfr_limb_ptr tp;
301 MPFR_TMP_DECL(marker);
303 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
304 k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
306 if (k == 0)
307 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
309 mpfr_pi1_t dinv2;
310 invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
311 return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
313 #else /* use our own code for base-case short division */
314 return mpfr_divhigh_n_basecase (qp, np, dp, n);
315 #endif
316 else if (k == n)
317 /* for k=n, we use a division with remainder (mpn_divrem),
318 which computes the exact quotient */
319 return mpn_divrem (qp, 0, np, 2 * n, dp, n);
321 MPFR_ASSERTD ((n+4)/2 <= k && k < n); /* bounds from [1] */
322 MPFR_TMP_MARK (marker);
323 l = n - k;
324 /* first divide the most significant 2k limbs from N by the most significant
325 k limbs of D */
326 qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
328 /* it remains {np,2l+k} = {np,n+l} as remainder */
330 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
331 D0={dp,l} */
332 tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
333 mpfr_mulhigh_n (tp, qp + k, dp, l);
334 /* we are only interested in the upper l limbs from {tp,2l} */
335 cy = mpn_sub_n (np + n, np + n, tp + l, l);
336 if (qh)
337 cy += mpn_sub_n (np + n, np + n, dp, l);
338 while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
340 qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
341 cy -= mpn_add_n (np + l, np + l, dp, n);
344 /* now it remains {np,n+l} to divide by D */
345 cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
346 qh += mpn_add_1 (qp + l, qp + l, k, cy);
347 MPFR_TMP_FREE(marker);
349 return qh;
351 #else /* below is the FoldDiv(K) algorithm from [1] */
352 mp_limb_t
353 mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
354 mp_size_t n)
356 mp_size_t k, r;
357 mpfr_limb_ptr ip, tp, up;
358 mp_limb_t qh = 0, cy, cc;
359 int count;
360 MPFR_TMP_DECL(marker);
362 #define K 3
363 if (n < K)
364 return mpn_divrem (qp, 0, np, 2 * n, dp, n);
366 k = (n - 1) / K + 1; /* ceil(n/K) */
368 MPFR_TMP_MARK (marker);
369 ip = MPFR_TMP_LIMBS_ALLOC (k + 1);
370 tp = MPFR_TMP_LIMBS_ALLOC (n + k);
371 up = MPFR_TMP_LIMBS_ALLOC (2 * (k + 1));
372 mpn_invert (ip, dp + n - (k + 1), k + 1, NULL); /* takes about 13% for n=1000 */
373 /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
374 for (r = n, cc = 0UL; r > 0;)
376 /* cc is the carry at np[n+r] */
377 MPFR_ASSERTD(cc <= 1);
378 /* FIXME: why can we have cc as large as say 8? */
379 count = 0;
380 while (cc > 0)
382 count ++;
383 MPFR_ASSERTD(count <= 1);
384 /* subtract {dp+n-r,r} from {np+n,r} */
385 cc -= mpn_sub_n (np + n, np + n, dp + n - r, r);
386 /* add 1 at qp[r] */
387 qh += mpn_add_1 (qp + r, qp + r, n - r, 1UL);
389 /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
390 if (r < k)
392 ip += k - r;
393 k = r;
395 /* now r >= k */
396 /* qp + r - 2 * k -> up */
397 mpfr_mulhigh_n (up, np + n + r - (k + 1), ip, k + 1);
398 /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
399 cy = mpn_add_n (qp + r - k, up + k + 2, np + n + r - k, k);
400 /* since we need only r limbs of tp (below), it suffices to consider
401 r high limbs of dp */
402 if (r > k)
404 #if 0
405 mpn_mul (tp, dp + n - r, r, qp + r - k, k);
406 #else /* use a short product for the low k x k limbs */
407 /* we know the upper k limbs of the r-limb product cancel with the
408 remainder, thus we only need to compute the low r-k limbs */
409 if (r - k >= k)
410 mpn_mul (tp + k, dp + n - r + k, r - k, qp + r - k, k);
411 else /* r-k < k */
413 /* #define LOW */
414 #ifndef LOW
415 mpn_mul (tp + k, qp + r - k, k, dp + n - r + k, r - k);
416 #else
417 mpfr_mullow_n_basecase (tp + k, qp + r - k, dp + n - r + k, r - k);
418 /* take into account qp[2r-2k] * dp[n - r + k] */
419 tp[r] += qp[2*r-2*k] * dp[n - r + k];
420 #endif
421 /* tp[k..r] is filled */
423 #if 0
424 mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k);
425 #else /* compute one more limb. FIXME: we could add one limb of dp in the
426 above, to save one mpn_addmul_1 call */
427 mpfr_mulhigh_n (up, dp + n - r, qp + r - k, k - 1); /* {up,2k-2} */
428 /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
429 up[2*k-2] = mpn_addmul_1 (up + k - 1, qp + r - k, k-1, dp[n-r+k-1]);
430 /* add {dp+n-r, k} * qp[r-1] */
431 up[2*k-1] = mpn_addmul_1 (up + k - 1, dp + n - r, k, qp[r-1]);
432 #endif
433 #ifndef LOW
434 cc = mpn_add_n (tp + k, tp + k, up + k, k);
435 mpn_add_1 (tp + 2 * k, tp + 2 * k, r - k, cc);
436 #else
437 /* update tp[k..r] */
438 if (r - k + 1 <= k)
439 mpn_add_n (tp + k, tp + k, up + k, r - k + 1);
440 else /* r - k >= k */
442 cc = mpn_add_n (tp + k, tp + k, up + k, k);
443 mpn_add_1 (tp + 2 * k, tp + 2 * k, r - 2 * k + 1, cc);
445 #endif
446 #endif
448 else /* last step: since we only want the quotient, no need to update,
449 just propagate the carry cy */
451 MPFR_ASSERTD(r < n);
452 if (cy > 0)
453 qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
454 break;
456 /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
457 update {np+n, n} */
458 /* we should have tp[r] = np[n+r-k] up to 1 */
459 MPFR_ASSERTD(tp[r] == np[n + r - k] || tp[r] + 1 == np[n + r - k]);
460 #ifndef LOW
461 cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r + 1); /* borrow at np[n+r] */
462 #else
463 cc = mpn_sub_n (np + n - 1, np + n - 1, tp + k - 1, r - k + 2);
464 #endif
465 /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
466 {dp+n-r,r} from {np+n,r} */
467 if (cy)
469 if (r < n)
470 cc += mpn_sub_n (np + n - 1, np + n - 1, dp + n - r - 1, r + 1);
471 else
472 cc += mpn_sub_n (np + n, np + n, dp + n - r, r);
473 /* propagate cy */
474 if (r == n)
475 qh = cy;
476 else
477 qh += mpn_add_1 (qp + r, qp + r, n - r, cy);
479 /* cc is the borrow at np[n+r] */
480 count = 0;
481 while (cc > 0) /* quotient was too large */
483 count++;
484 MPFR_ASSERTD (count <= 1);
485 cy = mpn_add_n (np + n, np + n, dp + n - (r - k), r - k);
486 cc -= mpn_add_1 (np + n + r - k, np + n + r - k, k, cy);
487 qh -= mpn_sub_1 (qp + r - k, qp + r - k, n - (r - k), 1UL);
489 r -= k;
490 cc = np[n + r];
492 MPFR_TMP_FREE(marker);
494 return qh;
496 #endif