[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom53_mul.c
/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
               |__b2|___b1_|___b0_|
               <-t--><--n--><--n-->

  v0  =    a0                  *  b0          #   A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #   A(1)*B(1)    ah  <= 4   bh <= 2
  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #  A(-1)*B(-1)  |ah| <= 2   bh <= 1
  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #   A(2)*B(2)    ah  <= 30  bh <= 6
  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #  A(-2)*B(-2)  -9<=ah<=20 -1<=bh<=4
  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2)  ah  <= 30  bh <= 6
  vinf=                     a4 *          b2  # A(inf)*B(inf)
*/
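
/* In polynomial terms, with x = 2^(GMP_NUMB_BITS * n) the operands are
   A(x) = a4*x^4 + a3*x^3 + a2*x^2 + a1*x + a0 and
   B(x) = b2*x^2 + b1*x + b0, so the product C(x) = A(x)*B(x) has degree 6,
   i.e. 7 coefficients, and the 7 point values above determine it uniquely;
   mpn_toom_interpolate_7pts recovers the coefficients at the end.

   Purely illustrative toy example (decimal digits standing in for limbs):
   a = 12345 gives a4..a0 = 1,2,3,4,5 and b = 678 gives b2..b0 = 6,7,8, so
   v0 = 5*8 = 40, v1 = (5+4+3+2+1)*(8+7+6) = 315, vinf = 1*6 = 6, etc.  */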

void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);
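
  /* n = max (ceil (an/5), ceil (bn/3)); a is split into four blocks of n
     limbs plus a top block a4 of s limbs, b into two blocks of n limbs
     plus a top block b2 of t limbs.  */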

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);   /* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += (cy >> 1);
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);  /* b0+b1+b2 */
#endif
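
  /* At this point bs1 = B(1) = b0 + b1 + b2 and bsm1 = |B(-1)| = |b0 - b1 + b2|;
     toom7_w3_neg is toggled when B(-1) < 0 (i.e. b1 > b0 + b2), so the flag
     ends up recording the sign of vm1 = A(-1)*B(-1) for the interpolation.  */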

  /* Compute bs2 and bsm2.  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  cy = mpn_addlsh2_n (bs2, b0, b2, t);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
#endif
  if (t < n)
    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
  bs2[n] = cy;
#else
  cy = mpn_lshift (gp, b2, t, 2);
  bs2[n] = mpn_add (bs2, b0, n, gp, t);
  MPN_INCR_U (bs2 + t, n+1-t, cy);
#endif

  gp[n] = mpn_lshift (gp, b1, n, 1);

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
    }
#else
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
    }
  mpn_add_n (bs2, bs2, gp, n+1);
#endif
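
  /* Now bs2 = B(2) = b0 + 2*b1 + 4*b2 and bsm2 = |B(-2)| = |b0 - 2*b1 + 4*b2|;
     toom7_w1_neg is toggled when B(-2) < 0 (i.e. 2*b1 > b0 + 4*b2), so the
     flag records the sign of vm2 = A(-2)*B(-2).  */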

  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bsh, b1, b0, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
      MPN_INCR_U (bsh + t, n+1-t, cy2);
    }
  else
    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
#else
  cy = mpn_lshift (bsh, b0, n, 1);
  cy += mpn_add_n (bsh, bsh, b1, n);
  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif
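
  /* ash = 16*a0 + 8*a1 + 4*a2 + 2*a3 + a4 = 16*A(1/2) and
     bsh = 4*b0 + 2*b1 + b2 = 4*B(1/2), so vh = ash*bsh below is the value
     of the product at 1/2 scaled by 2^6.  */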

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 4);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);

#define v0    pp                              /* 2n */
#define v1    (pp + 2 * n)                    /* 2n+1 */
#define vinf  (pp + 6 * n)                    /* s+t */
#define v2    scratch                         /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)           /* 2n+1 */
#define vh    (scratch + 4 * n + 2)           /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)           /* 2n+1 */
#define scratch_out (scratch + 8 * n + 4)     /* 2n+1 */
          /* Total scratch need: 10*n+5 */
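
  /* Callers must supply at least that much scratch; within GMP the size is
     presumably obtained from the mpn_toom53_mul_itch macro in gmp-impl.h.  */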

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1.  */
  mpn_mul_n (v2, as2, bs2, n + 1);            /* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);         /* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);            /* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
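  /* With SMALLER_RECURSION the recursive product is taken over n limbs only
     and the small top limbs asm1[n] and bsm1[n] are folded in afterwards;
     otherwise the multiplication is simply done on n+1 limbs whenever either
     top limb is nonzero.  */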
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bsm1[n] + mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */

  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */

  mpn_mul_n (v0, a0, b0, n);                  /* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);
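
  /* Interpolation: v0, v1 and vinf already sit in pp (at pp, pp + 2*n and
     pp + 6*n); mpn_toom_interpolate_7pts reconstructs the seven coefficients
     of the product from the seven point values and leaves the full an + bn
     limb result in pp.  */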
  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
                             scratch_out);

  TMP_FREE;
}