beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom32_mul.c
blob0b05669cc4e00a35291c9fd83ad2999b5a3fba8f
/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
	|_b1_|___b0_|
	<-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
/* Balanced n x n recursive multiply used for the point products.
   The ws (scratch) argument is deliberately unused: mpn_mul_n needs no
   caller-supplied scratch.  It is kept in the macro signature so call
   sites need not change if a scratch-using multiply is plugged in.  */
#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
65 void
66 mpn_toom32_mul (mp_ptr pp,
67 mp_srcptr ap, mp_size_t an,
68 mp_srcptr bp, mp_size_t bn,
69 mp_ptr scratch)
71 mp_size_t n, s, t;
72 int vm1_neg;
73 mp_limb_t cy;
74 mp_limb_signed_t hi;
75 mp_limb_t ap1_hi, bp1_hi;
77 #define a0 ap
78 #define a1 (ap + n)
79 #define a2 (ap + 2 * n)
80 #define b0 bp
81 #define b1 (bp + n)
83 /* Required, to ensure that s + t >= n. */
84 ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
86 n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
88 s = an - 2 * n;
89 t = bn - n;
91 ASSERT (0 < s && s <= n);
92 ASSERT (0 < t && t <= n);
93 ASSERT (s + t >= n);
95 /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
96 #define ap1 (pp) /* n, most significant limb in ap1_hi */
97 #define bp1 (pp + n) /* n, most significant bit in bp1_hi */
98 #define am1 (pp + 2*n) /* n, most significant bit in hi */
99 #define bm1 (pp + 3*n) /* n */
100 #define v1 (scratch) /* 2n + 1 */
101 #define vm1 (pp) /* 2n + 1 */
102 #define scratch_out (scratch + 2*n + 1) /* Currently unused. */
104 /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
106 /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
108 /* Compute ap1 = a0 + a1 + a3, am1 = a0 - a1 + a3 */
109 ap1_hi = mpn_add (ap1, a0, n, a2, s);
110 #if HAVE_NATIVE_mpn_add_n_sub_n
111 if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
113 ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
114 hi = 0;
115 vm1_neg = 1;
117 else
119 cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
120 hi = ap1_hi - (cy & 1);
121 ap1_hi += (cy >> 1);
122 vm1_neg = 0;
124 #else
125 if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
127 ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
128 hi = 0;
129 vm1_neg = 1;
131 else
133 hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
134 vm1_neg = 0;
136 ap1_hi += mpn_add_n (ap1, ap1, a1, n);
137 #endif
139 /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
140 if (t == n)
142 #if HAVE_NATIVE_mpn_add_n_sub_n
143 if (mpn_cmp (b0, b1, n) < 0)
145 cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
146 vm1_neg ^= 1;
148 else
150 cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
152 bp1_hi = cy >> 1;
153 #else
154 bp1_hi = mpn_add_n (bp1, b0, b1, n);
156 if (mpn_cmp (b0, b1, n) < 0)
158 ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
159 vm1_neg ^= 1;
161 else
163 ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
165 #endif
167 else
169 /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
170 bp1_hi = mpn_add (bp1, b0, n, b1, t);
172 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
174 ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
175 MPN_ZERO (bm1 + t, n - t);
176 vm1_neg ^= 1;
178 else
180 ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
184 TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
185 if (ap1_hi == 1)
187 cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
189 else if (ap1_hi == 2)
191 #if HAVE_NATIVE_mpn_addlsh1_n
192 cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
193 #else
194 cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
195 #endif
197 else
198 cy = 0;
199 if (bp1_hi != 0)
200 cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
201 v1[2 * n] = cy;
203 TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
204 if (hi)
205 hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
207 vm1[2*n] = hi;
209 /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
210 if (vm1_neg)
212 #if HAVE_NATIVE_mpn_rsh1sub_n
213 mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
214 #else
215 mpn_sub_n (v1, v1, vm1, 2*n+1);
216 ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
217 #endif
219 else
221 #if HAVE_NATIVE_mpn_rsh1add_n
222 mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
223 #else
224 mpn_add_n (v1, v1, vm1, 2*n+1);
225 ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
226 #endif
229 /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
231 y = x1 + x3 + (x0 + x2) * B
232 = (x0 + x2) * B + (x0 + x2) - vm1.
234 y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
235 follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
236 (already in place, except for carry propagation).
238 We thus add
240 B^3 B^2 B 1
241 | | | |
242 +-----+----+
243 + | x0 + x2 |
244 +----+-----+----+
245 + | x0 + x2 |
246 +----------+
247 - | vm1 |
248 --+----++----+----+-
249 | y2 | y1 | y0 |
250 +-----+----+----+
252 Since we store y0 at the same location as the low half of x0 + x2, we
253 need to do the middle sum first. */
255 hi = vm1[2*n];
256 cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
257 MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
259 /* FIXME: Can we get rid of this second vm1_neg conditional by
260 swapping the location of +1 and -1 values? */
261 if (vm1_neg)
263 cy = mpn_add_n (v1, v1, vm1, n);
264 hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
265 MPN_INCR_U (v1 + n, n+1, hi);
267 else
269 cy = mpn_sub_n (v1, v1, vm1, n);
270 hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
271 MPN_DECR_U (v1 + n, n+1, hi);
274 TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
275 /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */
276 if (s > t) mpn_mul (pp+3*n, a2, s, b1, t);
277 else mpn_mul (pp+3*n, b1, t, a2, s);
279 /* Remaining interpolation.
281 y * B + x0 + x3 B^3 - x0 B^2 - x3 B
282 = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
283 = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B
284 + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
285 = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
286 + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
288 B^4 B^3 B^2 B 1
289 | | | | | |
290 +-------+ +---------+---------+
291 | Hx3 | | Hx0-Lx3 | Lx0 |
292 +------+----------+---------+---------+---------+
293 | y2 | y1 | y0 |
294 ++---------+---------+---------+
295 -| Hx0-Lx3 | - Lx0 |
296 +---------+---------+
297 | - Hx3 |
298 +--------+
300 We must take into account the carry from Hx0 - Lx3.
303 cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
304 hi = scratch[2*n] + cy;
306 cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
307 hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
309 hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
311 /* FIXME: Is support for s + t == n needed? */
312 if (LIKELY (s + t > n))
314 hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
316 if (hi < 0)
317 MPN_DECR_U (pp + 4*n, s+t-n, -hi);
318 else
319 MPN_INCR_U (pp + 4*n, s+t-n, hi);
321 else
322 ASSERT (hi == 0);