[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / div_qr_2.c
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient. The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif
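
/* mpn_div_qr_2 (below) compares nn against this threshold to choose, for a
   normalized divisor, between the 3/2-inverse loop (mpn_div_qr_2n_pi1) and
   the 4/2-inverse loop (mpn_div_qr_2n_pi2).  With the default value of
   MP_LIMB_T_MAX the pi2 path is effectively never taken, matching the
   "disabled unless explicitly tuned" note above.  */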
#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif
/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagating the
   carry-out into an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum
   and a carry out. A little like a "full adder".
*/
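
/* In limb arithmetic (writing B = 2^GMP_LIMB_BITS), the intended effect of
   the two macros can be summarized roughly as follows; the asm variants and
   the generic C fallbacks below are the authoritative definitions.

     add_sssaaaa (s2,s1,s0, a1,a0, b1,b0):
       <s1,s0>  = (<a1,a0> + <b1,b0>) mod B^2
       s2      += carry out of that two-limb addition (0 or 1)

     add_csaac (co,s, a,b,ci):
       s   = (a + b + ci) mod B
       co  = carry out, i.e. (a + b + ci) / B  (0 or 1, for ci <= 1)
*/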
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "0" ((USItype)(s2)), \
             "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
           : "=r" (co), "=r" (s) \
           : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
             "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "0" ((UDItype)(s2)), \
             "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
           : "=r" (co), "=r" (s) \
           : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
             "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif
#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry. */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */
#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (s2) += __c1 + (__s1 < __c0); \
  } while (0)
#endif
#ifndef add_csaac
#define add_csaac(co, s, a, b, ci) \
  do { \
    UWtype __s, __c; \
    __s = (a) + (b); \
    __c = __s < (a); \
    __s = __s + (ci); \
    (s) = __s; \
    (co) = __c + (__s < (ci)); \
  } while (0)
#endif
/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
  do { \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
    mp_limb_t _t1, _t0; \
    mp_limb_t _c, _mask; \
 \
    umul_ppmm (_q3,_q2a, n3, di1); \
    umul_ppmm (_q2,_q1, n2, di1); \
    umul_ppmm (_q2c,_q1c, n3, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
    umul_ppmm (_q1d,_q0, n2, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
 \
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1)); \
 \
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0], plus 1 at the B^2 position \
       (the extra 1 folded into r0 above) */ \
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0)); \
    add_csaac (_c, _q1, _q1, n1, _c); \
    add_csaac (_c, _q2, _q2, r0, _c); \
    _q3 = _q3 + r1 + _c; \
 \
    umul_ppmm (_t1,_t0, _q2, d0); \
    _t1 += _q2 * d1 + _q3 * d0; \
 \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
 \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
 \
    if (UNLIKELY (r1 >= d1)) \
      { \
        if (r1 > d1 || r0 >= d0) \
          { \
            sub_ddmmss (r1, r0, r1, r0, d1, d0); \
            add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1)); \
          } \
      } \
    (q1) = _q3; \
    (q0) = _q2; \
  } while (0)
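
/* Informal sketch of what udiv_qr_4by2 computes; the macro above is the
   authoritative definition.  With B = 2^GMP_LIMB_BITS, a normalized divisor
   D = <d1,d0>, and the 4/2 inverse <1,di1,di0> = (B^4 - 1) / D (see
   invert_4by2 below), the quotient estimate is formed as

     <_q3,_q2,_q1,_q0> = <n3,n2> * <di1,di0> + <n3,n2,n1,n0> + B^2,

   whose two high limbs <_q3,_q2> approximate floor(<n3,n2,n1,n0> / D).
   The candidate remainder <n3,n2,n1,n0> - <_q3,_q2> * D is then computed
   modulo B^2 and corrected by at most two conditional adjustments (adding
   or subtracting D once each, with the matching quotient decrement or
   increment), after which <_q3,_q2> is the quotient and <r1,r0> the
   remainder.  */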
static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
        {
          if (p1 > d1 || p0 >= d0)
            {
              sub_ddmmss (p1, p0, p1, p0, d1, d0);
              v1--;
            }
        }
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
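
/* Divide {np,nn} by the normalized two-limb divisor <d1,d0> (d1 must have
   its high bit set), using the 4/2 inverse <di1,di0> produced by
   invert_4by2.  The main loop consumes two numerator limbs per iteration
   via udiv_qr_4by2, with a final udiv_qr_3by2 step when nn is odd.  The
   low nn-2 quotient limbs are stored at qp, the two-limb remainder at rp,
   and the most significant quotient limb is returned.  */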
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
                   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}
/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true. (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2. */
mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
              mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
        {
          gmp_pi1_t dinv;
          invert_pi1 (dinv, d1, d0);
          return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
        }
      else
        {
          mp_limb_t di[2];
          invert_4by2 (di, d1, d0);
          return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
        }
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
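
#if 0
/* Usage sketch for mpn_div_qr_2, kept out of the build.  The function name
   and limb values are illustrative only; any 4-limb numerator and 2-limb
   divisor with dp[1] != 0 would do. */
static void
div_qr_2_example (void)
{
  mp_limb_t np[4] = { 5, 6, 7, 8 };   /* numerator, least significant limb first */
  mp_limb_t dp[2] = { 3, 4 };         /* divisor; dp[1] must be nonzero */
  mp_limb_t qp[2], rp[2], qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  /* Now {np,4} = <qh, qp[1], qp[0]> * {dp,2} + {rp,2}, with {rp,2} < {dp,2}. */
}
#endif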