source/libs/gmp/gmp-src/mpn/generic/mod_1_1.c

/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif
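
/* Two implementations of mpn_mod_1_1p_cps and mpn_mod_1_1p follow, selected
   by MOD_1_1P_METHOD: method 1 accumulates with plain add_ssaaaa from
   longlong.h, while method 2 uses the wider add_mssaaaa primitive defined
   below, which also produces the carry as a mask.  */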

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */
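
/* That is, add_mssaaaa (m, s1, s0, a1, a0, b1, b0) computes the two-limb
   sum {s1, s0} = {a1, a0} + {b1, b0} and sets m to 0 when the addition does
   not carry out of the high limb, or to ~(UWtype) 0 (minus the carry) when
   it does; the generic C fallback further down spells out the reference
   semantics.  */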

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %k2\n\t"                                         \
             "adc  %4, %k1\n\t"                                         \
             "sbb  %k0, %k0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %q2\n\t"                                         \
             "adc  %4, %q1\n\t"                                         \
             "sbb  %q0, %q0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc   %r5, %6, %2\n\t"                                  \
             "addxcc  %r3, %4, %1\n\t"                                  \
             "subx    %%g0, %%g0, %0"                                   \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc   %r5, %6, %2\n\t"                                  \
             "addccc  %r7, %8, %%g0\n\t"                                \
             "addccc  %r3, %4, %1\n\t"                                  \
             "clr     %0\n\t"                                           \
             "movcs   %%xcc, -1, %0"                                    \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),               \
             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
             __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc    %r5, %6, %2\n\t"                                 \
             "addxccc  %r3, %4, %1\n\t"                                 \
             "clr      %0\n\t"                                          \
             "movcs    %%xcc, -1, %0"                                   \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add%I6c  %2, %5, %6\n\t"                                  \
             "adde     %1, %3, %4\n\t"                                  \
             "subfe    %0, %0, %0\n\t"                                  \
             "nor      %0, %0, %0"                                      \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "algr   %2, %6\n\t"                                        \
             "alcgr  %1, %4\n\t"                                        \
             "lghi   %0, 0\n\t"                                         \
             "alcgr  %0, %0\n\t"                                        \
             "lcgr   %0, %0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)),                  \
             "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "adds   %2, %5, %6\n\t"                                    \
             "adcs   %1, %3, %4\n\t"                                    \
             "movcc  %0, #0\n\t"                                        \
             "movcs  %0, #-1"                                           \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (m) = - (__c1 + (__s1 < __c0));                                     \
  } while (0)
#endif
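
/* Worked example (added for illustration, not in the original source): with
   64-bit limbs, {1, ~0} + {0, 1} = {2, 0} with no carry out of the high
   limb, so m == 0; while {~0, ~0} + {0, 1} wraps to {0, 0} with a carry
   out, so m == ~(UWtype) 0.  The method-2 code below uses this all-ones
   mask to subtract b conditionally without branching, e.g. r1 -= (r2 & b).  */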

#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);  /* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);  // NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}
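
/* Recap of the block filled in above (added note, not in the original):
   cps[0] = bi, the invert_limb inverse of the normalized divisor b << cnt;
   cps[1] = cnt, the leading-zero count of the original b;
   cps[2] = B1modb >> cnt and cps[3] = B2modb >> cnt.
   mpn_mod_1_1p below reads these back as bmodb[0..3].  */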

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);  /* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]                      < B
            + LO(rr) * (B mod b)         <= (B-1)(b-1)
            + HI(rr) * (B^2 mod b)       <= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */
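
/* Usage sketch (illustrative only, not part of the original file).  The cps
   block is computed once per divisor, and mpn_mod_1_1p expects the
   normalized divisor, i.e. b shifted left by cps[1], as its b argument (the
   rh >= b test and the udiv_rnnd_preinv call above assume a normalized b).
   A caller would therefore do something along these lines, mirroring how
   GMP's mpn_mod_1 wrapper dispatches to this routine:

     mp_limb_t pre[4], r;
     mpn_mod_1_1p_cps (pre, b);
     r = mpn_mod_1_1p (ap, n, b << pre[1], pre);

   where ap is the operand, n its size in limbs (n >= 2), and b the
   unnormalized divisor.  */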

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);  /* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);  // NB: equality iff b = B/2
  cps[3] = B2modb;
}
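
/* Note (added observation, not in the original): unlike the METHOD == 1
   variant, this variant stores B2modb in cps[3] without the >> cnt shift
   and fills cps[2] only when cnt != 0; the mpn_mod_1_1p below matches that
   layout, using bmodb[3] directly in its main loop and bmodb[2] only on the
   cnt != 0 path.  */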

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);  /* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
        {
          mp_limb_t cy;
          /* mp_limb_t t = r0 + B2mb; */
          umul_ppmm (p1, p0, r1, B2modb);

          ADDC_LIMB (cy, r0, r0, r2 & B2modb);
          /* Alternative, for cmov: if (cy) r0 = t; */
          r0 -= (-cy) & b;
          add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
        }

      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that.  */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */