beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom33_mul.c
blob655355c39a35267e473b83326d7e709c9122b763
1 /* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2 size. Or more accurately, bn <= an < (3/2)bn.
4 Contributed to the GNU project by Torbjorn Granlund.
5 Additional improvements by Marco Bodrato.
7 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
8 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
11 Copyright 2006-2008, 2010, 2012 Free Software Foundation, Inc.
13 This file is part of the GNU MP Library.
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
18 * the GNU Lesser General Public License as published by the Free
19 Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
24 * the GNU General Public License as published by the Free Software
25 Foundation; either version 2 of the License, or (at your option) any
26 later version.
28 or both in parallel, as here.
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 for more details.
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library. If not,
37 see https://www.gnu.org/licenses/. */
40 #include "gmp.h"
41 #include "gmp-impl.h"
43 /* Evaluate in: -1, 0, +1, +2, +inf
45 <-s--><--n--><--n-->
46 ____ ______ ______
47 |_a2_|___a1_|___a0_|
48 |b2_|___b1_|___b0_|
49 <-t-><--n--><--n-->
51 v0 = a0 * b0 # A(0)*B(0)
52 v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
53 vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1
54 v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6
55 vinf= a2 * b2 # A(inf)*B(inf)
56 */
/* Compile-time pruning of the recursion paths taken by TOOM33_MUL_N_REC
   below.  Tuned builds and fat binaries must keep every path alive since
   thresholds are not constants there; otherwise a path is compiled in only
   when the threshold spacing shows it is actually reachable.  */
58 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
59 #define MAYBE_mul_basecase 1
60 #define MAYBE_mul_toom33 1
61 #else
/* The recursive size is about n/3 per level, so basecase is reachable only
   if the toom33 threshold lies below 3x the toom22 threshold.  */
62 #define MAYBE_mul_basecase \
63 (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
/* Recursive toom33 is reachable only if sizes at/above its threshold can
   occur before toom44 would have been chosen instead.  */
64 #define MAYBE_mul_toom33 \
65 (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
66 #endif
68 /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
69 multiplication at the infinity point. We may have
70 MAYBE_mul_basecase == 0, and still get s just below
71 MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
72 s == 1 and mpn_toom22_mul will crash.
73 */
/* Recursive balanced n x n limb multiply used for the five point products:
   p[0..2n-1] = a[0..n-1] * b[0..n-1], with ws as recursion scratch.
   Dispatches on n among basecase, toom22 and toom33; the MAYBE_* macros
   above let unreachable branches be discarded at compile time.  */
75 #define TOOM33_MUL_N_REC(p, a, b, n, ws) \
76 do { \
77 if (MAYBE_mul_basecase \
78 && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
79 mpn_mul_basecase (p, a, n, b, n); \
80 else if (! MAYBE_mul_toom33 \
81 || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
82 mpn_toom22_mul (p, a, n, b, n, ws); \
83 else \
84 mpn_toom33_mul (p, a, n, b, n, ws); \
85 } while (0)
/* Multiply {ap,an} by {bp,bn} with 3-way Toom-Cook, writing the an+bn limb
   product to {pp, an+bn}.  Requires bn <= an < (3/2)*bn (enforced through
   the s/t ASSERTs below).  scratch is caller-provided workspace; this
   function itself touches scratch[0 .. 5n+5) plus recursion space starting
   at scratch_out.  NOTE(review): the exact scratch requirement must match
   the corresponding *_itch formula defined elsewhere in GMP -- confirm
   there before changing any of the layout offsets below.  */
87 void
88 mpn_toom33_mul (mp_ptr pp,
89 mp_srcptr ap, mp_size_t an,
90 mp_srcptr bp, mp_size_t bn,
91 mp_ptr scratch)
/* NOTE(review): this local appears to shadow the fat-binary global of the
   same name so the mpn_* calls below skip the cpuvec-initialized test --
   confirm against gmp-impl.h/fat support before relying on it.  */
93 const int __gmpn_cpuvec_initialized = 1;
94 mp_size_t n, s, t;
95 int vm1_neg; /* nonzero iff the point value A(-1)*B(-1) is negative */
96 mp_limb_t cy, vinf0;
97 mp_ptr gp; /* n+1 limb temporary holding a0+a2 resp. b0+b2 */
98 mp_ptr as1, asm1, as2; /* A evaluated at +1, -1, +2; n+1 limbs each */
99 mp_ptr bs1, bsm1, bs2; /* B evaluated at +1, -1, +2; n+1 limbs each */
/* Split A into a0,a1 of n limbs plus an s limb tail a2, and B into b0,b1
   of n limbs plus a t limb tail b2 (see the diagram above).  */
101 #define a0 ap
102 #define a1 (ap + n)
103 #define a2 (ap + 2*n)
104 #define b0 bp
105 #define b1 (bp + n)
106 #define b2 (bp + 2*n)
/* n = ceil(an/3); the (size_t) cast keeps the division unsigned.  */
108 n = (an + 2) / (size_t) 3;
110 s = an - 2 * n; /* length of a2 */
111 t = bn - 2 * n; /* length of b2 */
113 ASSERT (an >= bn);
115 ASSERT (0 < s && s <= n);
116 ASSERT (0 < t && t <= n);
/* Place the evaluation vectors in scratch and in not-yet-needed parts of
   pp.  as2 and bs2 live in pp but are consumed by the v2 recursion before
   pp's low part is overwritten with v0/v1.  */
118 as1 = scratch + 4 * n + 4;
119 asm1 = scratch + 2 * n + 2;
120 as2 = pp + n + 1;
122 bs1 = pp;
123 bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
124 bs2 = pp + 2 * n + 2;
126 gp = scratch;
128 vm1_neg = 0;
130 /* Compute as1 and asm1. */
131 cy = mpn_add (gp, a0, n, a2, s); /* gp = a0 + a2 */
132 #if HAVE_NATIVE_mpn_add_n_sub_n
/* If a0+a2 < a1 then A(-1) = a0-a1+a2 is negative: compute a1-(a0+a2)
   instead and record the sign in vm1_neg.  */
133 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
135 cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
136 as1[n] = cy >> 1; /* add carry is in bit 1 of the return value */
137 asm1[n] = 0;
138 vm1_neg = 1;
140 else
142 mp_limb_t cy2;
143 cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
144 as1[n] = cy + (cy2 >> 1); /* propagate gp's carry plus the add carry */
145 asm1[n] = cy - (cy2 & 1); /* gp's carry minus the subtract borrow */
147 #else
148 as1[n] = cy + mpn_add_n (as1, gp, a1, n); /* as1 = A(1) = a0+a1+a2 */
149 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
151 mpn_sub_n (asm1, a1, gp, n); /* A(-1) < 0: store its negation */
152 asm1[n] = 0;
153 vm1_neg = 1;
155 else
157 cy -= mpn_sub_n (asm1, gp, a1, n);
158 asm1[n] = cy;
160 #endif
162 /* Compute as2. */
/* as2 = A(2) = a0 + 2*a1 + 4*a2, formed either as 2*(as1 + a2) - a0
   (reusing as1 = a0+a1+a2) or Horner style as a0 + 2*(a1 + 2*a2).  */
163 #if HAVE_NATIVE_mpn_rsblsh1_n
164 cy = mpn_add_n (as2, a2, as1, s);
165 if (s != n)
166 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
167 cy += as1[n];
168 cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); /* as2 = 2*as2 - a0 */
169 #else
170 #if HAVE_NATIVE_mpn_addlsh1_n
171 cy = mpn_addlsh1_n (as2, a1, a2, s); /* as2 = a1 + 2*a2 */
172 if (s != n)
173 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
174 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); /* as2 = a0 + 2*as2 */
175 #else
176 cy = mpn_add_n (as2, a2, as1, s);
177 if (s != n)
178 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
179 cy += as1[n];
180 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
181 cy -= mpn_sub_n (as2, as2, a0, n);
182 #endif
183 #endif
184 as2[n] = cy;
186 /* Compute bs1 and bsm1. */
/* Mirror of the as1/asm1 computation above, operating on B; a negative
   B(-1) flips (xor) the sign of the vm1 point product.  */
187 cy = mpn_add (gp, b0, n, b2, t);
188 #if HAVE_NATIVE_mpn_add_n_sub_n
189 if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
191 cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
192 bs1[n] = cy >> 1;
193 bsm1[n] = 0;
194 vm1_neg ^= 1;
196 else
198 mp_limb_t cy2;
199 cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
200 bs1[n] = cy + (cy2 >> 1);
201 bsm1[n] = cy - (cy2 & 1);
203 #else
204 bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
205 if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
207 mpn_sub_n (bsm1, b1, gp, n);
208 bsm1[n] = 0;
209 vm1_neg ^= 1;
211 else
213 cy -= mpn_sub_n (bsm1, gp, b1, n);
214 bsm1[n] = cy;
216 #endif
218 /* Compute bs2. */
/* Mirror of the as2 computation: bs2 = B(2) = b0 + 2*b1 + 4*b2.  */
219 #if HAVE_NATIVE_mpn_rsblsh1_n
220 cy = mpn_add_n (bs2, b2, bs1, t);
221 if (t != n)
222 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
223 cy += bs1[n];
224 cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
225 #else
226 #if HAVE_NATIVE_mpn_addlsh1_n
227 cy = mpn_addlsh1_n (bs2, b1, b2, t);
228 if (t != n)
229 cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
230 cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
231 #else
232 cy = mpn_add_n (bs2, bs1, b2, t);
233 if (t != n)
234 cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
235 cy += bs1[n];
236 cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
237 cy -= mpn_sub_n (bs2, bs2, b0, n);
238 #endif
239 #endif
240 bs2[n] = cy;
/* High-limb bounds for each evaluation, per the table at the top of
   the file.  */
242 ASSERT (as1[n] <= 2);
243 ASSERT (bs1[n] <= 2);
244 ASSERT (asm1[n] <= 1);
245 ASSERT (bsm1[n] <= 1);
246 ASSERT (as2[n] <= 6);
247 ASSERT (bs2[n] <= 6);
/* Point-product areas: v0 and v1 in pp, vm1 and v2 in scratch, vinf in
   the top of pp; scratch_out is workspace for the recursive multiplies.  */
249 #define v0 pp /* 2n */
250 #define v1 (pp + 2 * n) /* 2n+1 */
251 #define vinf (pp + 4 * n) /* s+t */
252 #define vm1 scratch /* 2n+1 */
253 #define v2 (scratch + 2 * n + 1) /* 2n+2 */
254 #define scratch_out (scratch + 5 * n + 5)
256 /* vm1, 2n+1 limbs */
257 #ifdef SMALLER_RECURSION
/* Multiply only the low n limbs, then add in the cross terms for the
   high limbs (each at most 1, see the ASSERTs above).  */
258 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
259 cy = 0;
260 if (asm1[n] != 0)
261 cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
262 if (bsm1[n] != 0)
263 cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
264 vm1[2 * n] = cy;
265 #else
266 TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
267 #endif
269 TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */
271 /* vinf, s+t limbs */
/* Tail product a2*b2; unbalanced (s > t) goes through general mpn_mul.  */
272 if (s > t) mpn_mul (vinf, a2, s, b2, t);
273 else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
275 vinf0 = vinf[0]; /* v1 overlaps with this */
277 #ifdef SMALLER_RECURSION
278 /* v1, 2n+1 limbs */
/* Low n x n product, then add as1[n]*bs1 and bs1[n]*as1 into the high
   half; the high limbs are at most 2, hence the shift-by-one variants.  */
279 TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
280 if (as1[n] == 1)
282 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
284 else if (as1[n] != 0)
286 #if HAVE_NATIVE_mpn_addlsh1_n
287 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
288 #else
289 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
290 #endif
292 else
293 cy = 0;
294 if (bs1[n] == 1)
296 cy += mpn_add_n (v1 + n, v1 + n, as1, n);
298 else if (bs1[n] != 0)
300 #if HAVE_NATIVE_mpn_addlsh1_n
301 cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
302 #else
303 cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
304 #endif
306 v1[2 * n] = cy;
307 #else
/* The n+1 limb recursion writes 2n+2 limbs at pp+2n, clobbering
   vinf[1] (= pp[4n+1]); save it across the call and restore.  */
308 cy = vinf[1];
309 TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
310 vinf[1] = cy;
311 #endif
313 TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */
/* Recover the full product from the five point values; vm1_neg carries
   the sign of vm1 and vinf0 the saved low limb of vinf.  */
315 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);