/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2013 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif
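
/* Two implementations are selected by DIV_QR_1N_METHOD: method 1 is a plain
   udiv_qrnnd_preinv loop, method 2 (the default) restructures the per-limb
   work, presumably to shorten the dependency chain between iterations.  */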

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */
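
/* All variants below compute the double-limb sum (s1,s0) = (a1,a0) + (b1,b0)
   and set m to the negated carry out: 0 if the addition did not wrap, -1
   (all ones) if it did.  The C fallback at the end of this block spells out
   the same operation.  */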

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %k2\n\t"                                         \
             "adc  %4, %k1\n\t"                                         \
             "sbb  %k0, %k0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %q2\n\t"                                         \
             "adc  %4, %q1\n\t"                                         \
             "sbb  %q0, %q0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                                   \
             "addxcc %r3, %4, %1\n\t"                                   \
             "subx   %%g0, %%g0, %0"                                    \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                                   \
             "addccc %r7, %8, %%g0\n\t"                                 \
             "addccc %r3, %4, %1\n\t"                                   \
             "clr    %0\n\t"                                            \
             "movcs  %%xcc, -1, %0"                                     \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),               \
             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
             __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc   %r5, %6, %2\n\t"                                  \
             "addxccc %r3, %4, %1\n\t"                                  \
             "clr     %0\n\t"                                           \
             "movcs   %%xcc, -1, %0"                                    \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add%I6c %2, %5, %6\n\t"                                   \
             "adde    %1, %3, %4\n\t"                                   \
             "subfe   %0, %0, %0\n\t"                                   \
             "nor     %0, %0, %0"                                       \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "algr  %2, %6\n\t"                                         \
             "alcgr %1, %4\n\t"                                         \
             "lghi  %0, 0\n\t"                                          \
             "alcgr %0, %0\n\t"                                         \
             "lcgr  %0, %0"                                             \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)),                  \
             "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "adds  %2, %5, %6\n\t"                                     \
             "adcs  %1, %3, %4\n\t"                                     \
             "movcc %0, #0\n\t"                                         \
             "movcs %0, #-1"                                            \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (m) = - (__c1 + (__s1 < __c0));                                     \
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d.  */
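/* Here dinv is the single-limb reciprocal expected by udiv_qrnnd_preinv,
   i.e. floor((B^2-1)/d) - B for a normalized d, typically computed with
   invert_limb.  */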
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
                   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
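  /* Note: B2 is congruent to B^2 modulo d (with 1 <= B2 <= d); it lets a
     carry out of the two-limb window, worth B^2, be folded back in as a
     single limb.  */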
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
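
  /* From here on, u2 is a carry mask produced by add_mssaaaa: zero, or all
     ones when the two-limb sum wrapped.  A set mask stands for an extra B^2
     in the partial remainder, folded back in on the next iteration via
     u2 & B2 and matched by the "1 | v" term in the quotient update.  */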

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses.  */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *           +-------+
       *           |u1 * v |
       *           +---+---+
       *           | u1|
       *       +---+---+
       *       | 1 | v |   (conditional on u2)
       *       +---+---+
       *           | 1 |   (conditional on u0 + u2 B2 carry)
       *           +---+
       * +         | q0|
       *      -+---+---+---+
       *       | q2| q1| q0|
       *       +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }
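
  /* Final reduction: a set u2 mask, and then u1 >= d, each cost one more
     subtraction of d from u1, with a matching increment of the quotient
     word q1.  */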

  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#else
#error Unknown DIV_QR_1N_METHOD
#endif
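
#if 0
/* Illustrative usage sketch, not part of GMP: divide the n-limb number
   {up, n} by a single normalized limb d, writing the quotient to {qp, n}
   and returning the remainder.  The reciprocal is obtained with invert_limb,
   which is the form udiv_qrnnd_preinv (and hence this file) expects.  The
   function name is hypothetical.  */
static mp_limb_t
example_div_qr_1n (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t d)
{
  mp_limb_t dinv;
  ASSERT (d & GMP_NUMB_HIGHBIT);  /* d must be normalized */
  invert_limb (dinv, d);
  /* uh = 0 trivially satisfies the uh < d requirement. */
  return mpn_div_qr_1n_pi1 (qp, up, n, CNST_LIMB(0), d, dinv);
}
#endif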