/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"
#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif
/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagates
   carry-out into an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum
   and a carry out.  A little like a "full adder".
 */
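/* Illustrative sketch, not part of the original source: with
   B = 2^GMP_LIMB_BITS, the two macros compute

     add_sssaaaa:  <s2, s1, s0>  <-  s2 * B^2 + <a1, a0> + <b1, b0>
     add_csaac:    <co, s>       <-  a + b + ci

   so a hypothetical caller adding two double-limb values <x1, x0> and
   <y1, y0>, keeping the carry, might write

     mp_limb_t s2, s1, s0;
     s2 = 0;
     add_sssaaaa (s2, s1, s0, x1, x0, y1, y0);

   leaving the full three-limb sum in <s2, s1, s0>.  */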
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci)					\
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0"		\
	   : "=r" (co), "=r" (s)					\
	   : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)),			\
	     "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif
#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */
#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
#ifndef add_csaac
#define add_csaac(co, s, a, b, ci)					\
  do {									\
    UWtype __s, __c;							\
    __s = (a) + (b);							\
    __c = __s < (a);							\
    __s = __s + (ci);							\
    (s) = __s;								\
    (co) = __c + (__s < (ci));						\
  } while (0)
#endif
/* Typically used with r1, r0 same as n3, n2.  Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _c, _mask;						\
									\
    /* <_q3,_q2,_q1,_q0> = <n3,n2> * <di1,di0> */			\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d);			\
									\
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1));		\
									\
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0] + [0,1,0,0] */			\
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0));				\
    add_csaac (_c, _q1, _q1, n1, _c);					\
    add_csaac (_c, _q2, _q2, r0, _c);					\
    _q3 = _q3 + r1 + _c;						\
									\
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1)); \
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)
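/* Usage sketch, not part of the original source: given a normalized
   divisor <d1, d0> (high bit of d1 set), its 4/2 inverse <di1, di0>
   as computed by invert_4by2 below, and a four-limb dividend
   <n3, n2, n1, n0> with <n3, n2> < <d1, d0>, one step

     mp_limb_t q1, q0, r1, r0;
     r1 = n3; r0 = n2;
     udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);

   leaves the two-limb quotient in <q1, q0> and the remainder in
   <r1, r0>; the main loop of mpn_div_qr_2n_pi2 below uses it in
   exactly this overlapped form.  */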
static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>.  The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
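/* Usage sketch, not part of the original source: a caller with a
   normalized two-limb divisor dp[1], dp[0] computes the inverse once
   and then reuses it for the whole division, as mpn_div_qr_2 below
   does on its pi2 path:

     mp_limb_t di[2], qh;
     invert_4by2 (di, dp[1], dp[0]);
     qh = mpn_div_qr_2n_pi2 (qp, rp, np, nn, dp[1], dp[0], di[1], di[0]);
*/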
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  /* Process two dividend limbs per iteration, using the 4/2 inverse. */
  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  /* If nn is odd, a single low limb remains; use one 3/2 division step. */
  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}
/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */
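/* Example, not part of the original source: dividing a 5-limb number
   {np, 5} by a 2-limb divisor {dp, 2}:

     mp_limb_t qp[3], rp[2], qh;
     qh = mpn_div_qr_2 (qp, rp, np, 5, dp);

   leaves the three low quotient limbs in qp[0..2], the most
   significant quotient limb in qh, and the remainder in rp[0..1].  */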
mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      /* Divisor already normalized; choose between the 2/1 (pi1) and
	 4/2 (pi2) inverse based on the tuned threshold. */
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      /* Unnormalized divisor: shift it left so that the high bit of d1
	 is set, and use the variant that shifts the dividend on the fly. */
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}