1 /* Mulders' MulHigh function (short product)
3 Copyright 2005-2015 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramel projects, INRIA.
6 This file is part of the GNU MPFR Library.
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26 July 25-27, 2011, pages 7-14.
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
32 #ifndef MUL_FFT_THRESHOLD
33 #define MUL_FFT_THRESHOLD 8448
36 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
37 #ifdef MPFR_MULHIGH_TAB_SIZE
38 static short mulhigh_ktab
[MPFR_MULHIGH_TAB_SIZE
];
40 static short mulhigh_ktab
[] = {MPFR_MULHIGH_TAB
};
41 #define MPFR_MULHIGH_TAB_SIZE \
42 ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
45 /* Put in rp[n..2n-1] an approximation of the n high limbs
46 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
47 approximation is always less or equal to the truncated full product).
48 Assume 2n limbs are allocated at rp.
50 Implements Algorithm ShortMulNaive from [1].
53 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp
, mpfr_limb_srcptr up
,
54 mpfr_limb_srcptr vp
, mp_size_t n
)
59 umul_ppmm (rp
[1], rp
[0], up
[n
-1], vp
[0]); /* we neglect up[0..n-2]*vp[0],
60 which is less than B^n */
61 for (i
= 1 ; i
< n
; i
++)
62 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
63 rp
[i
+ 1] = mpn_addmul_1 (rp
, up
+ (n
- i
- 1), i
+ 1, vp
[i
]);
64 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
67 /* Put in rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
68 Assume 2n limbs are allocated at rp. */
70 mpfr_mullow_n_basecase (mpfr_limb_ptr rp
, mpfr_limb_srcptr up
,
71 mpfr_limb_srcptr vp
, mp_size_t n
)
75 rp
[n
] = mpn_mul_1 (rp
, up
, n
, vp
[0]);
76 for (i
= 1 ; i
< n
; i
++)
77 mpn_addmul_1 (rp
+ i
, up
, n
- i
+ 1, vp
[i
]);
80 /* Put in rp[n..2n-1] an approximation of the n high limbs
81 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
82 approximation is always less or equal to the truncated full product).
84 Implements Algorithm ShortMul from [1].
87 mpfr_mulhigh_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mpfr_limb_srcptr mp
,
92 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 8); /* so that 3*(n/4) > n/2 */
93 k
= MPFR_LIKELY (n
< MPFR_MULHIGH_TAB_SIZE
) ? mulhigh_ktab
[n
] : 3*(n
/4);
94 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
95 into k >= (n+4)/2 in the C language. */
96 MPFR_ASSERTD (k
== -1 || k
== 0 || (k
>= (n
+4)/2 && k
< n
));
98 mpn_mul_basecase (rp
, np
, n
, mp
, n
); /* result is exact, no error */
100 mpfr_mulhigh_n_basecase (rp
, np
, mp
, n
); /* basecase error < n ulps */
101 else if (n
> MUL_FFT_THRESHOLD
)
102 mpn_mul_n (rp
, np
, mp
, n
); /* result is exact, no error */
108 mpn_mul_n (rp
+ 2 * l
, np
+ l
, mp
+ l
, k
); /* fills rp[2l..2n-1] */
109 mpfr_mulhigh_n (rp
, np
+ k
, mp
, l
); /* fills rp[l-1..2l-1] */
110 cy
= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
111 mpfr_mulhigh_n (rp
, np
, mp
+ k
, l
); /* fills rp[l-1..2l-1] */
112 cy
+= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
113 mpn_add_1 (rp
+ n
+ l
, rp
+ n
+ l
, k
, cy
); /* propagate carry */
117 /* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
118 Assume 2n limbs are allocated at rp. */
120 mpfr_mullow_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mpfr_limb_srcptr mp
,
125 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 8); /* so that 3*(n/4) > n/2 */
126 k
= MPFR_LIKELY (n
< MPFR_MULHIGH_TAB_SIZE
) ? mulhigh_ktab
[n
] : 3*(n
/4);
127 MPFR_ASSERTD (k
== -1 || k
== 0 || (2 * k
>= n
&& k
< n
));
129 mpn_mul_basecase (rp
, np
, n
, mp
, n
);
131 mpfr_mullow_n_basecase (rp
, np
, mp
, n
);
132 else if (n
> MUL_FFT_THRESHOLD
)
133 mpn_mul_n (rp
, np
, mp
, n
);
138 mpn_mul_n (rp
, np
, mp
, k
); /* fills rp[0..2k] */
139 mpfr_mullow_n (rp
+ n
, np
+ k
, mp
, l
); /* fills rp[n..n+2l] */
140 mpn_add_n (rp
+ k
, rp
+ k
, rp
+ n
, l
+ 1);
141 mpfr_mullow_n (rp
+ n
, np
, mp
+ k
, l
); /* fills rp[n..n+2l] */
142 mpn_add_n (rp
+ k
, rp
+ k
, rp
+ n
, l
+ 1);
146 #ifdef MPFR_SQRHIGH_TAB_SIZE
147 static short sqrhigh_ktab
[MPFR_SQRHIGH_TAB_SIZE
];
149 static short sqrhigh_ktab
[] = {MPFR_SQRHIGH_TAB
};
150 #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
153 /* Put in rp[n..2n-1] an approximation of the n high limbs
154 of {np, n}^2. The error is less than n ulps of rp[n]. */
156 mpfr_sqrhigh_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mp_size_t n
)
160 MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE
> 2); /* ensures k < n */
161 k
= MPFR_LIKELY (n
< MPFR_SQRHIGH_TAB_SIZE
) ? sqrhigh_ktab
[n
]
162 : (n
+4)/2; /* ensures that k >= (n+3)/2 */
163 MPFR_ASSERTD (k
== -1 || k
== 0 || (k
>= (n
+4)/2 && k
< n
));
165 /* we can't use mpn_sqr_basecase here, since it requires
166 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
167 is not exported by GMP */
168 mpn_sqr_n (rp
, np
, n
);
170 mpfr_mulhigh_n_basecase (rp
, np
, np
, n
);
176 mpn_sqr_n (rp
+ 2 * l
, np
+ l
, k
); /* fills rp[2l..2n-1] */
177 mpfr_mulhigh_n (rp
, np
, np
+ k
, l
); /* fills rp[l-1..2l-1] */
178 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
179 cy
= mpn_lshift (rp
+ l
- 1, rp
+ l
- 1, l
+ 1, 1);
180 cy
+= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
181 mpn_add_1 (rp
+ n
+ l
, rp
+ n
+ l
, k
, cy
); /* propagate carry */
185 #ifdef MPFR_DIVHIGH_TAB_SIZE
186 static short divhigh_ktab
[MPFR_DIVHIGH_TAB_SIZE
];
188 static short divhigh_ktab
[] = {MPFR_DIVHIGH_TAB
};
189 #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
192 #ifndef __GMPFR_GMP_H__
193 #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
196 #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
197 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
198 with the most significant limb of the quotient as return value (0 or 1).
199 Assumes the most significant bit of D is set. Clobbers N.
201 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
204 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp
, mpfr_limb_ptr np
,
205 mpfr_limb_srcptr dp
, mp_size_t n
)
207 mp_limb_t qh
, d1
, d0
, dinv
, q2
, q1
, q0
;
212 if ((qh
= (mpn_cmp (np
, dp
, n
) >= 0)))
213 mpn_sub_n (np
, np
, dp
, n
);
215 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
221 invert_limb (dinv
, d1
);
222 umul_ppmm (q1
, q0
, np
[0], dinv
);
229 invert_pi1 (dinv2
, d1
, d0
);
230 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
233 /* Invariant: it remains to reduce n limbs from N (in addition to the
234 initial low n limbs).
235 Since n >= 2 here, necessarily we had n >= 2 initially, which means
236 that in addition to the limb np[n-1] to reduce, we have at least 2
237 extra limbs, thus accessing np[n-3] is valid. */
239 /* warning: we can have np[n-1]=d1 and np[n-2]=d0, but since {np,n} < D,
240 the largest possible partial quotient is B-1 */
241 if (MPFR_UNLIKELY(np
[n
- 1] == d1
&& np
[n
- 2] == d0
))
242 q2
= ~ (mp_limb_t
) 0;
244 udiv_qr_3by2 (q2
, q1
, q0
, np
[n
- 1], np
[n
- 2], np
[n
- 3],
245 d1
, d0
, dinv2
.inv32
);
246 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
247 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
248 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
249 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
250 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
251 which proves that at most one correction is needed */
252 q0
= mpn_submul_1 (np
- 1, dp
, n
, q2
);
253 if (MPFR_UNLIKELY(q0
> np
[n
- 1]))
255 mpn_add_n (np
- 1, np
- 1, dp
, n
);
262 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
263 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
264 <= floor((np[0]*B+np[1])/d1)
265 thus q1 is not larger than the true quotient.
266 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
267 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
268 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
269 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
270 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
271 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
273 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
276 In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
279 umul_ppmm (q1
, q0
, np
[0], dinv2
.inv32
);
286 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
287 with the most significant limb of the quotient as return value (0 or 1).
288 Assumes the most significant bit of D is set. Clobbers N.
290 This implements the ShortDiv algorithm from reference [1].
294 mpfr_divhigh_n (mpfr_limb_ptr qp
, mpfr_limb_ptr np
, mpfr_limb_ptr dp
,
300 MPFR_TMP_DECL(marker
);
302 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 15); /* so that 2*(n/3) >= (n+4)/2 */
303 k
= MPFR_LIKELY (n
< MPFR_DIVHIGH_TAB_SIZE
) ? divhigh_ktab
[n
] : 2*(n
/3);
306 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
309 invert_pi1 (dinv2
, dp
[n
- 1], dp
[n
- 2]);
310 return __gmpn_sbpi1_divappr_q (qp
, np
, n
+ n
, dp
, n
, dinv2
.inv32
);
312 #else /* use our own code for base-case short division */
313 return mpfr_divhigh_n_basecase (qp
, np
, dp
, n
);
316 /* for k=n, we use a division with remainder (mpn_divrem),
317 which computes the exact quotient */
318 return mpn_divrem (qp
, 0, np
, 2 * n
, dp
, n
);
320 MPFR_ASSERTD ((n
+4)/2 <= k
&& k
< n
); /* bounds from [1] */
321 MPFR_TMP_MARK (marker
);
323 /* first divide the most significant 2k limbs from N by the most significant
325 qh
= mpn_divrem (qp
+ l
, 0, np
+ 2 * l
, 2 * k
, dp
+ l
, k
); /* exact */
327 /* it remains {np,2l+k} = {np,n+l} as remainder */
329 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
331 tp
= MPFR_TMP_LIMBS_ALLOC (2 * l
);
332 mpfr_mulhigh_n (tp
, qp
+ k
, dp
, l
);
333 /* we are only interested in the upper l limbs from {tp,2l} */
334 cy
= mpn_sub_n (np
+ n
, np
+ n
, tp
+ l
, l
);
336 cy
+= mpn_sub_n (np
+ n
, np
+ n
, dp
, l
);
337 while (cy
> 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
339 qh
-= mpn_sub_1 (qp
+ l
, qp
+ l
, k
, MPFR_LIMB_ONE
);
340 cy
-= mpn_add_n (np
+ l
, np
+ l
, dp
, n
);
343 /* now it remains {np,n+l} to divide by D */
344 cy
= mpfr_divhigh_n (qp
, np
+ k
, dp
+ k
, l
);
345 qh
+= mpn_add_1 (qp
+ l
, qp
+ l
, k
, cy
);
346 MPFR_TMP_FREE(marker
);
350 #else /* below is the FoldDiv(K) algorithm from [1] */
352 mpfr_divhigh_n (mpfr_limb_ptr qp
, mpfr_limb_ptr np
, mpfr_limb_ptr dp
,
356 mpfr_limb_ptr ip
, tp
, up
;
357 mp_limb_t qh
= 0, cy
, cc
;
359 MPFR_TMP_DECL(marker
);
363 return mpn_divrem (qp
, 0, np
, 2 * n
, dp
, n
);
365 k
= (n
- 1) / K
+ 1; /* ceil(n/K) */
367 MPFR_TMP_MARK (marker
);
368 ip
= MPFR_TMP_LIMBS_ALLOC (k
+ 1);
369 tp
= MPFR_TMP_LIMBS_ALLOC (n
+ k
);
370 up
= MPFR_TMP_LIMBS_ALLOC (2 * (k
+ 1));
371 mpn_invert (ip
, dp
+ n
- (k
+ 1), k
+ 1, NULL
); /* takes about 13% for n=1000 */
372 /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
373 for (r
= n
, cc
= 0UL; r
> 0;)
375 /* cc is the carry at np[n+r] */
376 MPFR_ASSERTD(cc
<= 1);
377 /* FIXME: why can we have cc as large as say 8? */
382 MPFR_ASSERTD(count
<= 1);
383 /* subtract {dp+n-r,r} from {np+n,r} */
384 cc
-= mpn_sub_n (np
+ n
, np
+ n
, dp
+ n
- r
, r
);
386 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, 1UL);
388 /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
395 /* qp + r - 2 * k -> up */
396 mpfr_mulhigh_n (up
, np
+ n
+ r
- (k
+ 1), ip
, k
+ 1);
397 /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
398 cy
= mpn_add_n (qp
+ r
- k
, up
+ k
+ 2, np
+ n
+ r
- k
, k
);
399 /* since we need only r limbs of tp (below), it suffices to consider
400 r high limbs of dp */
404 mpn_mul (tp
, dp
+ n
- r
, r
, qp
+ r
- k
, k
);
405 #else /* use a short product for the low k x k limbs */
406 /* we know the upper k limbs of the r-limb product cancel with the
407 remainder, thus we only need to compute the low r-k limbs */
409 mpn_mul (tp
+ k
, dp
+ n
- r
+ k
, r
- k
, qp
+ r
- k
, k
);
414 mpn_mul (tp
+ k
, qp
+ r
- k
, k
, dp
+ n
- r
+ k
, r
- k
);
416 mpfr_mullow_n_basecase (tp
+ k
, qp
+ r
- k
, dp
+ n
- r
+ k
, r
- k
);
417 /* take into account qp[2r-2k] * dp[n - r + k] */
418 tp
[r
] += qp
[2*r
-2*k
] * dp
[n
- r
+ k
];
420 /* tp[k..r] is filled */
423 mpfr_mulhigh_n (up
, dp
+ n
- r
, qp
+ r
- k
, k
);
424 #else /* compute one more limb. FIXME: we could add one limb of dp in the
425 above, to save one mpn_addmul_1 call */
426 mpfr_mulhigh_n (up
, dp
+ n
- r
, qp
+ r
- k
, k
- 1); /* {up,2k-2} */
427 /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
428 up
[2*k
-2] = mpn_addmul_1 (up
+ k
- 1, qp
+ r
- k
, k
-1, dp
[n
-r
+k
-1]);
429 /* add {dp+n-r, k} * qp[r-1] */
430 up
[2*k
-1] = mpn_addmul_1 (up
+ k
- 1, dp
+ n
- r
, k
, qp
[r
-1]);
433 cc
= mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, k
);
434 mpn_add_1 (tp
+ 2 * k
, tp
+ 2 * k
, r
- k
, cc
);
436 /* update tp[k..r] */
438 mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, r
- k
+ 1);
439 else /* r - k >= k */
441 cc
= mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, k
);
442 mpn_add_1 (tp
+ 2 * k
, tp
+ 2 * k
, r
- 2 * k
+ 1, cc
);
447 else /* last step: since we only want the quotient, no need to update,
448 just propagate the carry cy */
452 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, cy
);
455 /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
457 /* we should have tp[r] = np[n+r-k] up to 1 */
458 MPFR_ASSERTD(tp
[r
] == np
[n
+ r
- k
] || tp
[r
] + 1 == np
[n
+ r
- k
]);
460 cc
= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, tp
+ k
- 1, r
+ 1); /* borrow at np[n+r] */
462 cc
= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, tp
+ k
- 1, r
- k
+ 2);
464 /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
465 {dp+n-r,r} from {np+n,r} */
469 cc
+= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, dp
+ n
- r
- 1, r
+ 1);
471 cc
+= mpn_sub_n (np
+ n
, np
+ n
, dp
+ n
- r
, r
);
476 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, cy
);
478 /* cc is the borrow at np[n+r] */
480 while (cc
> 0) /* quotient was too large */
483 MPFR_ASSERTD (count
<= 1);
484 cy
= mpn_add_n (np
+ n
, np
+ n
, dp
+ n
- (r
- k
), r
- k
);
485 cc
-= mpn_add_1 (np
+ n
+ r
- k
, np
+ n
+ r
- k
, k
, cy
);
486 qh
-= mpn_sub_1 (qp
+ r
- k
, qp
+ r
- k
, n
- (r
- k
), 1UL);
491 MPFR_TMP_FREE(marker
);