1 /* Mulders' MulHigh function (short product)
3 Copyright 2005-2016 Free Software Foundation, Inc.
4 Contributed by the AriC and Caramba projects, INRIA.
6 This file is part of the GNU MPFR Library.
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LESSER. If not, see
20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
24 [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
25 Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
26 July 25-27, 2011, pages 7-14.
29 #define MPFR_NEED_LONGLONG_H
30 #include "mpfr-impl.h"
32 #ifndef MUL_FFT_THRESHOLD
33 #define MUL_FFT_THRESHOLD 8448
36 /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
37 #ifdef MPFR_MULHIGH_TAB_SIZE
38 static short mulhigh_ktab
[MPFR_MULHIGH_TAB_SIZE
];
40 static short mulhigh_ktab
[] = {MPFR_MULHIGH_TAB
};
41 #define MPFR_MULHIGH_TAB_SIZE \
42 ((mp_size_t) (sizeof(mulhigh_ktab) / sizeof(mulhigh_ktab[0])))
45 /* Put in rp[n..2n-1] an approximation of the n high limbs
46 of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
47 approximation is always less or equal to the truncated full product).
48 Assume 2n limbs are allocated at rp.
50 Implements Algorithm ShortMulNaive from [1].
53 mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp
, mpfr_limb_srcptr up
,
54 mpfr_limb_srcptr vp
, mp_size_t n
)
59 umul_ppmm (rp
[1], rp
[0], up
[n
-1], vp
[0]); /* we neglect up[0..n-2]*vp[0],
60 which is less than B^n */
61 for (i
= 1 ; i
< n
; i
++)
62 /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
63 rp
[i
+ 1] = mpn_addmul_1 (rp
, up
+ (n
- i
- 1), i
+ 1, vp
[i
]);
64 /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
67 /* Put in rp[0..n] the n+1 low limbs of {up, n} * {vp, n}.
68 Assume 2n limbs are allocated at rp. */
70 mpfr_mullow_n_basecase (mpfr_limb_ptr rp
, mpfr_limb_srcptr up
,
71 mpfr_limb_srcptr vp
, mp_size_t n
)
75 rp
[n
] = mpn_mul_1 (rp
, up
, n
, vp
[0]);
76 for (i
= 1 ; i
< n
; i
++)
77 mpn_addmul_1 (rp
+ i
, up
, n
- i
+ 1, vp
[i
]);
80 /* Put in rp[n..2n-1] an approximation of the n high limbs
81 of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
82 approximation is always less or equal to the truncated full product).
84 Implements Algorithm ShortMul from [1].
87 mpfr_mulhigh_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mpfr_limb_srcptr mp
,
92 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 8); /* so that 3*(n/4) > n/2 */
93 k
= MPFR_LIKELY (n
< MPFR_MULHIGH_TAB_SIZE
) ? mulhigh_ktab
[n
] : 3*(n
/4);
94 /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
95 into k >= (n+4)/2 in the C language. */
96 MPFR_ASSERTD (k
== -1 || k
== 0 || (k
>= (n
+4)/2 && k
< n
));
98 mpn_mul_basecase (rp
, np
, n
, mp
, n
); /* result is exact, no error */
100 mpfr_mulhigh_n_basecase (rp
, np
, mp
, n
); /* basecase error < n ulps */
101 else if (n
> MUL_FFT_THRESHOLD
)
102 mpn_mul_n (rp
, np
, mp
, n
); /* result is exact, no error */
108 mpn_mul_n (rp
+ 2 * l
, np
+ l
, mp
+ l
, k
); /* fills rp[2l..2n-1] */
109 mpfr_mulhigh_n (rp
, np
+ k
, mp
, l
); /* fills rp[l-1..2l-1] */
110 cy
= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
111 mpfr_mulhigh_n (rp
, np
, mp
+ k
, l
); /* fills rp[l-1..2l-1] */
112 cy
+= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
113 mpn_add_1 (rp
+ n
+ l
, rp
+ n
+ l
, k
, cy
); /* propagate carry */
117 /* Put in rp[0..n] the n+1 low limbs of {np, n} * {mp, n}.
118 Assume 2n limbs are allocated at rp. */
120 mpfr_mullow_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mpfr_limb_srcptr mp
,
125 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 8); /* so that 3*(n/4) > n/2 */
126 k
= MPFR_LIKELY (n
< MPFR_MULHIGH_TAB_SIZE
) ? mulhigh_ktab
[n
] : 3*(n
/4);
127 MPFR_ASSERTD (k
== -1 || k
== 0 || (2 * k
>= n
&& k
< n
));
129 mpn_mul_basecase (rp
, np
, n
, mp
, n
);
131 mpfr_mullow_n_basecase (rp
, np
, mp
, n
);
132 else if (n
> MUL_FFT_THRESHOLD
)
133 mpn_mul_n (rp
, np
, mp
, n
);
138 mpn_mul_n (rp
, np
, mp
, k
); /* fills rp[0..2k] */
139 mpfr_mullow_n (rp
+ n
, np
+ k
, mp
, l
); /* fills rp[n..n+2l] */
140 mpn_add_n (rp
+ k
, rp
+ k
, rp
+ n
, l
+ 1);
141 mpfr_mullow_n (rp
+ n
, np
, mp
+ k
, l
); /* fills rp[n..n+2l] */
142 mpn_add_n (rp
+ k
, rp
+ k
, rp
+ n
, l
+ 1);
146 #ifdef MPFR_SQRHIGH_TAB_SIZE
147 static short sqrhigh_ktab
[MPFR_SQRHIGH_TAB_SIZE
];
149 static short sqrhigh_ktab
[] = {MPFR_SQRHIGH_TAB
};
150 #define MPFR_SQRHIGH_TAB_SIZE (sizeof(sqrhigh_ktab) / sizeof(sqrhigh_ktab[0]))
153 /* Put in rp[n..2n-1] an approximation of the n high limbs
154 of {np, n}^2. The error is less than n ulps of rp[n]. */
156 mpfr_sqrhigh_n (mpfr_limb_ptr rp
, mpfr_limb_srcptr np
, mp_size_t n
)
160 MPFR_ASSERTN (MPFR_SQRHIGH_TAB_SIZE
> 2); /* ensures k < n */
161 k
= MPFR_LIKELY (n
< MPFR_SQRHIGH_TAB_SIZE
) ? sqrhigh_ktab
[n
]
162 : (n
+4)/2; /* ensures that k >= (n+3)/2 */
163 MPFR_ASSERTD (k
== -1 || k
== 0 || (k
>= (n
+4)/2 && k
< n
));
165 /* we can't use mpn_sqr_basecase here, since it requires
166 n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
167 is not exported by GMP */
168 mpn_sqr_n (rp
, np
, n
);
170 mpfr_mulhigh_n_basecase (rp
, np
, np
, n
);
176 mpn_sqr_n (rp
+ 2 * l
, np
+ l
, k
); /* fills rp[2l..2n-1] */
177 mpfr_mulhigh_n (rp
, np
, np
+ k
, l
); /* fills rp[l-1..2l-1] */
178 /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
179 cy
= mpn_lshift (rp
+ l
- 1, rp
+ l
- 1, l
+ 1, 1);
180 cy
+= mpn_add_n (rp
+ n
- 1, rp
+ n
- 1, rp
+ l
- 1, l
+ 1);
181 mpn_add_1 (rp
+ n
+ l
, rp
+ n
+ l
, k
, cy
); /* propagate carry */
185 #ifdef MPFR_DIVHIGH_TAB_SIZE
186 static short divhigh_ktab
[MPFR_DIVHIGH_TAB_SIZE
];
188 static short divhigh_ktab
[] = {MPFR_DIVHIGH_TAB
};
189 #define MPFR_DIVHIGH_TAB_SIZE (sizeof(divhigh_ktab) / sizeof(divhigh_ktab[0]))
192 #ifndef __GMPFR_GMP_H__
193 #define mpfr_pi1_t gmp_pi1_t /* with a GMP build */
196 #if !(defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q))
197 /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
198 with the most significant limb of the quotient as return value (0 or 1).
199 Assumes the most significant bit of D is set. Clobbers N.
201 The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
204 mpfr_divhigh_n_basecase (mpfr_limb_ptr qp
, mpfr_limb_ptr np
,
205 mpfr_limb_srcptr dp
, mp_size_t n
)
207 mp_limb_t qh
, d1
, d0
, dinv
, q2
, q1
, q0
;
212 if ((qh
= (mpn_cmp (np
, dp
, n
) >= 0)))
213 mpn_sub_n (np
, np
, dp
, n
);
215 /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
221 invert_limb (dinv
, d1
);
222 umul_ppmm (q1
, q0
, np
[0], dinv
);
229 invert_pi1 (dinv2
, d1
, d0
);
230 /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
233 /* Invariant: it remains to reduce n limbs from N (in addition to the
234 initial low n limbs).
235 Since n >= 2 here, necessarily we had n >= 2 initially, which means
236 that in addition to the limb np[n-1] to reduce, we have at least 2
237 extra limbs, thus accessing np[n-3] is valid. */
239 /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here,
240 since we truncate the divisor at each step, but since {np,n} < D
241 originally, the largest possible partial quotient is B-1. */
242 if (MPFR_UNLIKELY(np
[n
-1] > d1
|| (np
[n
-1] == d1
&& np
[n
-2] >= d0
)))
243 q2
= ~ (mp_limb_t
) 0;
245 udiv_qr_3by2 (q2
, q1
, q0
, np
[n
- 1], np
[n
- 2], np
[n
- 3],
246 d1
, d0
, dinv2
.inv32
);
247 /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
248 we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
249 thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
250 and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
251 thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
252 which proves that at most one correction is needed */
253 q0
= mpn_submul_1 (np
- 1, dp
, n
, q2
);
254 if (MPFR_UNLIKELY(q0
> np
[n
- 1]))
256 mpn_add_n (np
- 1, np
- 1, dp
, n
);
263 /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
264 q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
265 <= floor((np[0]*B+np[1])/d1)
266 thus q1 is not larger than the true quotient.
267 q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
268 For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
269 thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
270 (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
271 d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
272 thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
274 For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
277 In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
280 umul_ppmm (q1
, q0
, np
[0], dinv2
.inv32
);
287 /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
288 with the most significant limb of the quotient as return value (0 or 1).
289 Assumes the most significant bit of D is set. Clobbers N.
291 This implements the ShortDiv algorithm from reference [1].
295 mpfr_divhigh_n (mpfr_limb_ptr qp
, mpfr_limb_ptr np
, mpfr_limb_ptr dp
,
301 MPFR_TMP_DECL(marker
);
303 MPFR_ASSERTN (MPFR_MULHIGH_TAB_SIZE
>= 15); /* so that 2*(n/3) >= (n+4)/2 */
304 k
= MPFR_LIKELY (n
< MPFR_DIVHIGH_TAB_SIZE
) ? divhigh_ktab
[n
] : 2*(n
/3);
307 #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
310 invert_pi1 (dinv2
, dp
[n
- 1], dp
[n
- 2]);
311 return __gmpn_sbpi1_divappr_q (qp
, np
, n
+ n
, dp
, n
, dinv2
.inv32
);
313 #else /* use our own code for base-case short division */
314 return mpfr_divhigh_n_basecase (qp
, np
, dp
, n
);
317 /* for k=n, we use a division with remainder (mpn_divrem),
318 which computes the exact quotient */
319 return mpn_divrem (qp
, 0, np
, 2 * n
, dp
, n
);
321 MPFR_ASSERTD ((n
+4)/2 <= k
&& k
< n
); /* bounds from [1] */
322 MPFR_TMP_MARK (marker
);
324 /* first divide the most significant 2k limbs from N by the most significant
326 qh
= mpn_divrem (qp
+ l
, 0, np
+ 2 * l
, 2 * k
, dp
+ l
, k
); /* exact */
328 /* it remains {np,2l+k} = {np,n+l} as remainder */
330 /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
332 tp
= MPFR_TMP_LIMBS_ALLOC (2 * l
);
333 mpfr_mulhigh_n (tp
, qp
+ k
, dp
, l
);
334 /* we are only interested in the upper l limbs from {tp,2l} */
335 cy
= mpn_sub_n (np
+ n
, np
+ n
, tp
+ l
, l
);
337 cy
+= mpn_sub_n (np
+ n
, np
+ n
, dp
, l
);
338 while (cy
> 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
340 qh
-= mpn_sub_1 (qp
+ l
, qp
+ l
, k
, MPFR_LIMB_ONE
);
341 cy
-= mpn_add_n (np
+ l
, np
+ l
, dp
, n
);
344 /* now it remains {np,n+l} to divide by D */
345 cy
= mpfr_divhigh_n (qp
, np
+ k
, dp
+ k
, l
);
346 qh
+= mpn_add_1 (qp
+ l
, qp
+ l
, k
, cy
);
347 MPFR_TMP_FREE(marker
);
351 #else /* below is the FoldDiv(K) algorithm from [1] */
353 mpfr_divhigh_n (mpfr_limb_ptr qp
, mpfr_limb_ptr np
, mpfr_limb_ptr dp
,
357 mpfr_limb_ptr ip
, tp
, up
;
358 mp_limb_t qh
= 0, cy
, cc
;
360 MPFR_TMP_DECL(marker
);
364 return mpn_divrem (qp
, 0, np
, 2 * n
, dp
, n
);
366 k
= (n
- 1) / K
+ 1; /* ceil(n/K) */
368 MPFR_TMP_MARK (marker
);
369 ip
= MPFR_TMP_LIMBS_ALLOC (k
+ 1);
370 tp
= MPFR_TMP_LIMBS_ALLOC (n
+ k
);
371 up
= MPFR_TMP_LIMBS_ALLOC (2 * (k
+ 1));
372 mpn_invert (ip
, dp
+ n
- (k
+ 1), k
+ 1, NULL
); /* takes about 13% for n=1000 */
373 /* {ip, k+1} = floor((B^(2k+2)-1)/D - B^(k+1) where D = {dp+n-(k+1),k+1} */
374 for (r
= n
, cc
= 0UL; r
> 0;)
376 /* cc is the carry at np[n+r] */
377 MPFR_ASSERTD(cc
<= 1);
378 /* FIXME: why can we have cc as large as say 8? */
383 MPFR_ASSERTD(count
<= 1);
384 /* subtract {dp+n-r,r} from {np+n,r} */
385 cc
-= mpn_sub_n (np
+ n
, np
+ n
, dp
+ n
- r
, r
);
387 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, 1UL);
389 /* it remains r limbs to reduce, i.e., the remainder is {np, n+r} */
396 /* qp + r - 2 * k -> up */
397 mpfr_mulhigh_n (up
, np
+ n
+ r
- (k
+ 1), ip
, k
+ 1);
398 /* take into account the term B^k in the inverse: B^k * {np+n+r-k, k} */
399 cy
= mpn_add_n (qp
+ r
- k
, up
+ k
+ 2, np
+ n
+ r
- k
, k
);
400 /* since we need only r limbs of tp (below), it suffices to consider
401 r high limbs of dp */
405 mpn_mul (tp
, dp
+ n
- r
, r
, qp
+ r
- k
, k
);
406 #else /* use a short product for the low k x k limbs */
407 /* we know the upper k limbs of the r-limb product cancel with the
408 remainder, thus we only need to compute the low r-k limbs */
410 mpn_mul (tp
+ k
, dp
+ n
- r
+ k
, r
- k
, qp
+ r
- k
, k
);
415 mpn_mul (tp
+ k
, qp
+ r
- k
, k
, dp
+ n
- r
+ k
, r
- k
);
417 mpfr_mullow_n_basecase (tp
+ k
, qp
+ r
- k
, dp
+ n
- r
+ k
, r
- k
);
418 /* take into account qp[2r-2k] * dp[n - r + k] */
419 tp
[r
] += qp
[2*r
-2*k
] * dp
[n
- r
+ k
];
421 /* tp[k..r] is filled */
424 mpfr_mulhigh_n (up
, dp
+ n
- r
, qp
+ r
- k
, k
);
425 #else /* compute one more limb. FIXME: we could add one limb of dp in the
426 above, to save one mpn_addmul_1 call */
427 mpfr_mulhigh_n (up
, dp
+ n
- r
, qp
+ r
- k
, k
- 1); /* {up,2k-2} */
428 /* add {qp + r - k, k - 1} * dp[n-r+k-1] */
429 up
[2*k
-2] = mpn_addmul_1 (up
+ k
- 1, qp
+ r
- k
, k
-1, dp
[n
-r
+k
-1]);
430 /* add {dp+n-r, k} * qp[r-1] */
431 up
[2*k
-1] = mpn_addmul_1 (up
+ k
- 1, dp
+ n
- r
, k
, qp
[r
-1]);
434 cc
= mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, k
);
435 mpn_add_1 (tp
+ 2 * k
, tp
+ 2 * k
, r
- k
, cc
);
437 /* update tp[k..r] */
439 mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, r
- k
+ 1);
440 else /* r - k >= k */
442 cc
= mpn_add_n (tp
+ k
, tp
+ k
, up
+ k
, k
);
443 mpn_add_1 (tp
+ 2 * k
, tp
+ 2 * k
, r
- 2 * k
+ 1, cc
);
448 else /* last step: since we only want the quotient, no need to update,
449 just propagate the carry cy */
453 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, cy
);
456 /* subtract {tp, n+k} from {np+r-k, n+k}; however we only want to
458 /* we should have tp[r] = np[n+r-k] up to 1 */
459 MPFR_ASSERTD(tp
[r
] == np
[n
+ r
- k
] || tp
[r
] + 1 == np
[n
+ r
- k
]);
461 cc
= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, tp
+ k
- 1, r
+ 1); /* borrow at np[n+r] */
463 cc
= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, tp
+ k
- 1, r
- k
+ 2);
465 /* if cy = 1, subtract {dp, n} from {np+r, n}, thus
466 {dp+n-r,r} from {np+n,r} */
470 cc
+= mpn_sub_n (np
+ n
- 1, np
+ n
- 1, dp
+ n
- r
- 1, r
+ 1);
472 cc
+= mpn_sub_n (np
+ n
, np
+ n
, dp
+ n
- r
, r
);
477 qh
+= mpn_add_1 (qp
+ r
, qp
+ r
, n
- r
, cy
);
479 /* cc is the borrow at np[n+r] */
481 while (cc
> 0) /* quotient was too large */
484 MPFR_ASSERTD (count
<= 1);
485 cy
= mpn_add_n (np
+ n
, np
+ n
, dp
+ n
- (r
- k
), r
- k
);
486 cc
-= mpn_add_1 (np
+ n
+ r
- k
, np
+ n
+ r
- k
, k
, cy
);
487 qh
-= mpn_sub_1 (qp
+ r
- k
, qp
+ r
- k
, n
- (r
- k
), 1UL);
492 MPFR_TMP_FREE(marker
);