source/libs/gmp/gmp-src/mpn/generic/toom62_mul.c
/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in:
   0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0  =    a0                       *   b0      #   A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #   A(1)*B(1)    ah  <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #  A(-1)*B(-1)  |ah| <= 2   bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #   A(2)*B(2)    ah  <= 62  bh <= 2
  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #  A(-2)*B(-2)  -41<=ah<=20 -1<=bh<=0
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) # A(1/2)*B(1/2)  ah  <= 62  bh <= 2
  vinf=                           a5 *      b1  # A(inf)*B(inf)
*/
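
/* Illustrative usage sketch (not part of the original file): as the notice
   above warns, this internal routine should only be reached through GMP's
   documented interfaces.  For limb arrays stored least significant limb
   first, the supported entry point is mpn_mul; whether it actually selects
   mpn_toom62_mul depends on the operand sizes and the configured thresholds.

     mp_limb_t up[12], vp[4], rp[12 + 4];
     mp_limb_t rh;
     // ... fill up[0..11] and vp[0..3] ...
     rh = mpn_mul (rp, up, 12, vp, 4);  // rp gets 12+4 limbs; rh == rp[15]

   The sizes 12 and 4 are arbitrary illustrative values, not thresholds.  */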

void
mpn_toom62_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr gp;
  enum toom7_flags aflags, bflags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
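
  /* Illustrative numbers only (not in the original source): an = 11, bn = 4
     satisfy (5/2)bn < an < 6bn and give n = 2, s = 11 - 5*2 = 1, t = 4 - 2 = 2,
     so A splits into five 2-limb blocks a0..a4 plus a 1-limb a5, and B into a
     2-limb b0 plus a 2-limb b1.  */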

  TMP_MARK;

  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);
  asm2 = TMP_SALLOC_LIMBS (n + 1);
  ash = TMP_SALLOC_LIMBS (n + 1);

  bs1 = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2 = TMP_SALLOC_LIMBS (n + 1);
  bsm2 = TMP_SALLOC_LIMBS (n + 1);
  bsh = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;

  /* Compute as1 and asm1.  */
  aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));

  /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
     = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */

#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a5, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a4, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a5, s);
#endif

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
          bflags = toom7_w3_neg;
        }
      else
        {
          cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
          bflags = (enum toom7_flags) 0;
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          bflags = toom7_w3_neg;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
          bflags = (enum toom7_flags) 0;
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          bflags = toom7_w3_neg;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
          bflags = (enum toom7_flags) 0;
        }
    }

  /* Compute bs2 and bsm2, recycling bs1 and bsm1: bs2 = bs1 + b1,
     bsm2 = bsm1 - b1.  */
  mpn_add (bs2, bs1, n + 1, b1, t);
  if (bflags & toom7_w3_neg)
    {
      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
    }
  else
    {
      /* FIXME: Simplify this logic? */
      if (t < n)
        {
          if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
              MPN_ZERO (bsm2 + t, n + 1 - t);
              bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
            }
          else
            {
              ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
              bsm2[n] = 0;
            }
        }
      else
        {
          if (mpn_cmp (bsm1, b1, n) < 0)
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
              bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
            }
          else
            {
              ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
            }
          bsm2[n] = 0;
        }
    }

  /* Compute bsh, recycling bs1: bsh = bs1 + b0.  */
  bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);

  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (asm2[n] <= 41);
  ASSERT (bsm2[n] <= 1);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);

#define v0    pp                              /* 2n */
#define v1    (pp + 2 * n)                    /* 2n+1 */
#define vinf  (pp + 6 * n)                    /* s+t */
#define v2    scratch                         /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)           /* 2n+1 */
#define vh    (scratch + 4 * n + 2)           /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)           /* 2n+1 */
#define scratch_out  (scratch + 8 * n + 4)    /* 2n+1 */
  /* Total scratch need: 10*n+5 */
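
  /* Illustrative note (not in the original source): a caller inside the
     library must pass a scratch area covering the layout above, i.e. at
     least 10*n+5 limbs for the n computed from an and bn.  A sketch using
     the TMP allocator from gmp-impl.h:

       mp_ptr ws;
       TMP_DECL;
       TMP_MARK;
       ws = TMP_ALLOC_LIMBS (10 * n + 5);
       mpn_toom62_mul (pp, ap, an, bp, bn, ws);
       TMP_FREE;

     In GMP itself the size would normally come from the matching *_itch
     helper in gmp-impl.h rather than being hard-coded like this.  */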

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1. */
  mpn_mul_n (v2, as2, bs2, n + 1);          /* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);       /* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);          /* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (v0, a0, b0, n);                /* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a5, s, b1, t);
  else        mpn_mul (vinf, b1, t, a5, s);

  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
                             vm2, vm1, v2, vh, s + t, scratch_out);

  TMP_FREE;
}