/* mulmod_bnm1.c -- multiplication mod B^n-1.

   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
39 #include "gmp.h"
40 #include "gmp-impl.h"
41 #include "longlong.h"

/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
   tp==rp is allowed. */
void
mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
                    mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_mul_n (tp, ap, bp, rn);
  cy = mpn_add_n (rp, tp, tp + rn, rn);
  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
   * be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, cy);
}
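
/* Illustration only, not part of GMP: the same fold-the-high-half idea at
   single-word scale.  B^rn == 1 (mod B^rn - 1), so the high half of the
   double-length product is simply added to the low half, exactly as the
   mpn_add_n/MPN_INCR_U pair above does limb by limb.  The helper name and
   the toy modulus 2^16 - 1 are made up for exposition.  */
static unsigned long
mulmod_2e16m1_sketch (unsigned long a, unsigned long b)
{
  unsigned long t, r;

  ASSERT (a < 0x10000UL && b < 0x10000UL);

  t = a * b;                            /* full 32-bit product */
  r = (t & 0xffffUL) + (t >> 16);       /* fold: 2^16 == 1 (mod 2^16 - 1) */
  r = (r & 0xffffUL) + (r >> 16);       /* absorb the carry of the first fold */
  return r;                             /* semi-normalised: zero may come out as 2^16 - 1 */
}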

/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
   semi-normalised representation, computation is mod B^rn + 1. Needs
   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
                    mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_mul_n (tp, ap, bp, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn+1, cy);
}
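
/* Illustration only, not part of GMP: the corresponding reduction mod
   B^rn + 1 at single-word scale.  B^rn == -1 (mod B^rn + 1), so the high
   half of the product is subtracted from the low half, which is what the
   mpn_sub_n call above does limb by limb.  The helper name and the toy
   modulus 2^16 + 1 are made up for exposition.  */
static unsigned long
mulmod_2e16p1_sketch (unsigned long a, unsigned long b)
{
  unsigned long long t;
  unsigned long lo, hi;

  ASSERT (a <= 0x10000UL && b <= 0x10000UL);

  t = (unsigned long long) a * b;       /* full product, up to 2^32 */
  lo = (unsigned long) (t & 0xffffUL);
  hi = (unsigned long) (t >> 16);       /* at most 2^16 */
  /* subtract the high half; one conditional add renormalises into [0, 2^16] */
  return lo >= hi ? lo - hi : lo - hi + 0x10001UL;
}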

/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operands
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
        {
          if (UNLIKELY (an + bn <= rn))
            {
              mpn_mul (rp, ap, an, bp, bn);
            }
          else
            {
              mp_limb_t cy;
              mpn_mul (tp, ap, an, bp, bn);
              cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
              MPN_INCR_U (rp, rn, cy);
            }
        }
      else
        mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
         recursive products at rp. Requiring strict inequality makes
         the code slightly simpler. If desired, we could avoid this
         restriction by initially halving rn as long as rn is even and
         an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
         and crt together as

         x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp          /* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
        mp_srcptr am1, bm1;
        mp_size_t anm, bnm;
        mp_ptr so;

        bm1 = b0;
        bnm = bn;
        if (LIKELY (an > n))
          {
            am1 = xp;
            cy = mpn_add (xp, a0, n, a1, an - n);
            MPN_INCR_U (xp, n, cy);
            anm = n;
            so = xp + n;
            if (LIKELY (bn > n))
              {
                bm1 = so;
                cy = mpn_add (so, b0, n, b1, bn - n);
                MPN_INCR_U (so, n, cy);
                bnm = n;
                so += n;
              }
          }
        else
          {
            so = xp;
            am1 = a0;
            anm = an;
          }

        mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
        int       k;
        mp_srcptr ap1, bp1;
        mp_size_t anp, bnp;

        bp1 = b0;
        bnp = bn;
        if (LIKELY (an > n)) {
          ap1 = sp1;
          cy = mpn_sub (sp1, a0, n, a1, an - n);
          sp1[n] = 0;
          MPN_INCR_U (sp1, n + 1, cy);
          anp = n + ap1[n];
          if (LIKELY (bn > n)) {
            bp1 = sp1 + n + 1;
            cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
            sp1[2*n+1] = 0;
            MPN_INCR_U (sp1 + n + 1, n + 1, cy);
            bnp = n + bp1[n];
          }
        } else {
          ap1 = a0;
          anp = an;
        }

        if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
          k = 0;
        else
          {
            int mask;
            k = mpn_fft_best_k (n, 0);
            mask = (1<<k) - 1;
            while (n & mask) { k--; mask >>= 1; };
          }
        if (k >= FFT_FIRST_K)
          xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
        else if (UNLIKELY (bp1 == b0))
          {
            ASSERT (anp + bnp <= 2*n+1);
            ASSERT (anp + bnp > n);
            ASSERT (anp >= bnp);
            mpn_mul (xp, ap1, anp, bp1, bnp);
            anp = anp + bnp - n;
            ASSERT (anp <= n || xp[2*n] == 0);
            anp -= anp > n;
            cy = mpn_sub (xp, xp, n, xp + n, anp);
            xp[n] = 0;
            MPN_INCR_U (xp, n+1, cy);
          }
        else
          mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

         xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
         Division by 2 is a bitwise rotation.

         Assumes xp normalised mod (B^n+1).

         The residue class [0] is represented by [B^n-1]; except when
         both inputs are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc (rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
         overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n (rp, rp, xp, n); /* B^n = 1 */
      hi = (cy << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
         the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa (cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS - 1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc (rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n (rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0] & 1);
      mpn_rshift (rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy cannot overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment cannot overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U (rp, n, cy);

      /* Compute the highest half:
         ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
        {
          /* Note that in this case, the only way the result can equal
             zero mod B^{rn} - 1 is if one of the inputs is zero, and
             then the output of both the recursive calls and this CRT
             reconstruction is zero, not B^{rn} - 1. Which is good,
             since the latter representation doesn't fit in the output
             area. */
          cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

          /* FIXME: This subtraction of the high parts is not really
             necessary, we do it to get the carry out, and for sanity
             checking. */
          cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
                                   xp + an + bn - n, rn - (an + bn), cy);
          ASSERT (an + bn == rn - 1 ||
                  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
          cy = mpn_sub_1 (rp, rp, an + bn, cy);
          ASSERT (cy == (xp + an + bn - n)[0]);
        }
      else
        {
          cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
          /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
             DECR will affect _at most_ the lowest n limbs. */
          MPN_DECR_U (rp, 2*n, cy);
        }

#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
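
/* Illustration only, not part of GMP: the CRT recomposition above at
   single-word scale.  Given xm = x mod (2^16 - 1) (semi-normalised) and
   xp = x mod (2^16 + 1) (normalised, so xp may equal 2^16), recover
   x mod (2^32 - 1) with the same formula used above,

     x = -xp * 2^16 + (2^16 + 1) * [ (xp + xm)/2 mod (2^16 - 1) ],

   where the division by 2 is a bitwise right rotation.  The helper name is
   made up for exposition.  */
static unsigned long long
crt_2e32m1_sketch (unsigned long xm, unsigned long xp)
{
  unsigned long s;
  long long x;

  ASSERT (xm <= 0xffffUL && xp <= 0x10000UL);

  s = xp + xm;
  s = (s & 0xffffUL) + (s >> 16);       /* reduce mod 2^16 - 1 */
  s = (s >> 1) | ((s & 1UL) << 15);     /* halve by rotating right one bit */

  /* combine; one conditional add brings the result into [0, 2^32 - 1] */
  x = (long long) s * 0x10001LL - ((long long) xp << 16);
  return x >= 0 ? (unsigned long long) x : (unsigned long long) (x + 0xffffffffLL);
}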

mp_size_t
mpn_mulmod_bnm1_next_size (mp_size_t n)
{
  mp_size_t nh;

  if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD))
    return n;
  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (2-1)) & (-2);
  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (4-1)) & (-4);

  nh = (n + 1) >> 1;

  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
    return (n + (8-1)) & (-8);

  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
}
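
/* Illustration only, not part of GMP proper: a typical call sequence for
   computing a full product {rp, an+bn} via mpn_mulmod_bnm1, along the lines
   sketched in the comment before that function -- pick rn >= an + bn so the
   product cannot wrap around, round it up to a supported size with
   mpn_mulmod_bnm1_next_size, and provide scratch within the 2*rn + 4 limb
   bound derived above (real callers use the mpn_mulmod_bnm1_itch estimate
   from gmp-impl.h).  The helper name is made up for exposition.  */
static void
mul_via_bnm1_sketch (mp_ptr rp, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
{
  mp_size_t rn;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (an >= bn && bn > 0);

  rn = mpn_mulmod_bnm1_next_size (an + bn);     /* a*b < B^rn - 1, no reduction */

  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (2 * rn + 4);            /* scratch bound from the comment above */
  mpn_mulmod_bnm1 (rp, rn, ap, an, bp, bn, tp); /* {rp, an+bn} <- a * b */
  TMP_FREE;
}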