/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
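/* One way to read the evaluations above: with A and B split at n-limb
   boundaries as in the diagram, the product is
   x0 + x1 B + x2 B^2 + x3 B^3 (B denoting the n-limb block base), and the
   four evaluations determine the coefficients via

     x0      = v0
     x3      = vinf
     x0 + x2 = (v1 + vm1) / 2
     x1 + x3 = (v1 - vm1) / 2 = (x0 + x2) - vm1

   The code never forms x1 and x2 individually; it carries the combined
   quantity y = (x1 + x3) + (x0 + x2) * B through the final recombination
   instead.  */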
#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
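/* Call sketch (informal): the caller supplies a product area pp of an + bn
   limbs and a scratch area of at least 2*n + 1 limbs (n as computed below;
   this suffices with the current non-recursive TOOM32_MUL_N_REC).  Inside
   GMP the scratch size is normally obtained from the matching itch macro in
   gmp-impl.h rather than computed by hand:

     mpn_toom32_mul (pp, ap, an, bp, bn, scratch);
*/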
void
mpn_toom32_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  mp_limb_signed_t hi;
  mp_limb_t ap1_hi, bp1_hi;
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)
  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);
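  /* Two small worked examples of the splitting:

     an = 12, bn = 8:  2*an = 24 >= 3*bn, so n = 1 + (12-1)/3 = 4,
                       s = an - 2*n = 4, t = bn - n = 4.
     an = 10, bn = 8:  2*an = 20 <  3*bn, so n = 1 + (8-1)/2 = 4,
                       s = 2, t = 4.

     In both cases 0 < s,t <= n and s + t >= n, as the assertions demand.  */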
  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)		/* n, most significant limb in ap1_hi */
#define bp1 (pp + n)		/* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)		/* n, most significant bit in hi */
#define bm1 (pp + 3*n)		/* n */
#define v1 (scratch)		/* 2n + 1 */
#define vm1 (pp)		/* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif
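  /* At this point ap1 holds the low n limbs of a0 + a1 + a2 with its top
     limb in ap1_hi (at most 2), and am1 holds |a0 - a1 + a2| with its top
     bit in hi; vm1_neg records whether that difference was negative.  */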
  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
	  vm1_neg ^= 1;
	}
      else
	{
	  cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
	}
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
	{
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
	  vm1_neg ^= 1;
	}
      else
	{
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
	}
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
	{
	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
	  MPN_ZERO (bm1 + t, n - t);
	  vm1_neg ^= 1;
	}
      else
	{
	  ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
	}
    }
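  /* bp1 now holds b0 + b1 with its carry in bp1_hi (at most 1), and bm1
     holds |b0 - b1|.  vm1_neg is set exactly when one of the two factors
     a0 - a1 + a2 and b0 - b1 is negative, i.e. when vm1 = A(-1)*B(-1)
     itself is negative.  */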
  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;
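  /* The recursive call above multiplied only the low n limbs of each
     factor.  The corrections add ap1_hi * bp1 and bp1_hi * ap1 into the
     high half, plus the cross term ap1_hi * bp1_hi, so that v1 holds the
     full 2*n + 1 limb value A(1)*B(1).  */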
  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1 + n, vm1 + n, bm1, n);

  vm1[2*n] = hi;
  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
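  /* Whatever the sign of vm1, the combination formed above equals
     2*(x0 + x2), so it is even and the halving is exact; in the fallback
     path the ASSERT_NOCARRY on mpn_rshift checks precisely that no bit is
     shifted out.  v1 now holds x0 + x2 in 2*n + 1 limbs.  */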
  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

     y = x1 + x3 + (x0 + x2) * B
       = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2.  We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     Since we store y0 at the same location as the low half of x0 + x2, we
     need to do the middle sum first. */
  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n + 1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n + 1, hi);
    }
  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp + 3*n, a2, s, b1, t);
  else        mpn_mul (pp + 3*n, b1, t, a2, s);
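  /* The product area now holds x0 = v0 in pp[0 .. 2*n), y1 in
     pp[2*n .. 3*n), and x3 = vinf in pp[3*n .. 3*n + s + t); y0 and y2 sit
     at scratch and scratch + n, with the top limb of y in scratch[2*n].  */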
  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + L x0 + H x0 B
       + L x3 B^3 + H x3 B^4 - L x0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

     That is, over the limb-block positions (1, B, B^2, B^3, B^4) of the
     product area: L x0 stays in place, y0 and H x0 - L x3 land at B,
     y1 - L x0 - H x3 at B^2, y2 - (H x0 - L x3) at B^3, and H x3 at B^4.

     We must take into account the carry from Hx0 - Lx3.  */
  cy = mpn_sub_n (pp + n, pp + n, pp + 3*n, n);	/* Hx0 - Lx3, in place at B */
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);	/* y1 - Lx0 at B^2 */
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy); /* y2 - (Hx0 - Lx3) at B^3 */

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);	/* add y0 at B */
  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
	MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
	MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}