beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom44_mul.c
blobfbde633c30043ce521a285c7bccece497b37c94a
1 /* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2 size. Or more accurately, bn <= an < (4/3)bn.
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 Copyright 2006-2008, 2013 Free Software Foundation, Inc.
12 This file is part of the GNU MP Library.
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
27 or both in parallel, as here.
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32 for more details.
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library. If not,
36 see https://www.gnu.org/licenses/. */
39 #include "gmp.h"
40 #include "gmp-impl.h"
42 /* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
44 <-s--><--n--><--n--><--n-->
45 ____ ______ ______ ______
46 |_a3_|___a2_|___a1_|___a0_|
47 |b3_|___b2_|___b1_|___b0_|
48 <-t-><--n--><--n--><--n-->
50 v0 = a0 * b0 # A(0)*B(0)
51 v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3
52 vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1
53 v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14
54 vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(2)*B(2) ah <= 9 |bh| <= 9
55 vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14
56 vinf= a3 * b2 # A(inf)*B(inf)
59 #if TUNE_PROGRAM_BUILD
60 #define MAYBE_mul_basecase 1
61 #define MAYBE_mul_toom22 1
62 #define MAYBE_mul_toom44 1
63 #else
64 #define MAYBE_mul_basecase \
65 (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
66 #define MAYBE_mul_toom22 \
67 (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
68 #define MAYBE_mul_toom44 \
69 (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
70 #endif
72 #define TOOM44_MUL_N_REC(p, a, b, n, ws) \
73 do { \
74 if (MAYBE_mul_basecase \
75 && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
76 mpn_mul_basecase (p, a, n, b, n); \
77 else if (MAYBE_mul_toom22 \
78 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
79 mpn_toom22_mul (p, a, n, b, n, ws); \
80 else if (! MAYBE_mul_toom44 \
81 || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \
82 mpn_toom33_mul (p, a, n, b, n, ws); \
83 else \
84 mpn_toom44_mul (p, a, n, b, n, ws); \
85 } while (0)
87 /* Use of scratch space. In the product area, we store
89 ___________________
90 |vinf|____|_v1_|_v0_|
91 s+t 2n-1 2n+1 2n
93 The other recursive products, vm1, v2, vm2, vh are stored in the
94 scratch area. When computing them, we use the product area for
95 intermediate values.
97 Next, we compute v1. We can store the intermediate factors at v0
98 and at vh + 2n + 2.
100 Finally, for v0 and vinf, factors are parts of the input operands,
101 and we need scratch space only for the recursive multiplication.
103 In all, if S(an) is the scratch need, the needed space is bounded by
105 S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
107 which should give S(n) = 8 n/3 + c log(n) for some constant c.
110 void
111 mpn_toom44_mul (mp_ptr pp,
112 mp_srcptr ap, mp_size_t an,
113 mp_srcptr bp, mp_size_t bn,
114 mp_ptr scratch)
116 mp_size_t n, s, t;
117 mp_limb_t cy;
118 enum toom7_flags flags;
120 #define a0 ap
121 #define a1 (ap + n)
122 #define a2 (ap + 2*n)
123 #define a3 (ap + 3*n)
124 #define b0 bp
125 #define b1 (bp + n)
126 #define b2 (bp + 2*n)
127 #define b3 (bp + 3*n)
129 ASSERT (an >= bn);
131 n = (an + 3) >> 2;
133 s = an - 3 * n;
134 t = bn - 3 * n;
136 ASSERT (0 < s && s <= n);
137 ASSERT (0 < t && t <= n);
138 ASSERT (s >= t);
140 /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
141 * following limb, so these must be computed in order, and we need a
142 * one limb gap to tp. */
143 #define v0 pp /* 2n */
144 #define v1 (pp + 2 * n) /* 2n+1 */
145 #define vinf (pp + 6 * n) /* s+t */
146 #define v2 scratch /* 2n+1 */
147 #define vm2 (scratch + 2 * n + 1) /* 2n+1 */
148 #define vh (scratch + 4 * n + 2) /* 2n+1 */
149 #define vm1 (scratch + 6 * n + 3) /* 2n+1 */
150 #define tp (scratch + 8*n + 5)
152 /* apx and bpx must not overlap with v1 */
153 #define apx pp /* n+1 */
154 #define amx (pp + n + 1) /* n+1 */
155 #define bmx (pp + 2*n + 2) /* n+1 */
156 #define bpx (pp + 4*n + 2) /* n+1 */
158 /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
159 gives roughly 32 n/3 + log term. */
161 /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */
162 flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));
164 /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */
165 flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp)));
167 TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */
168 TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */
170 /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */
171 #if HAVE_NATIVE_mpn_addlsh1_n
172 cy = mpn_addlsh1_n (apx, a1, a0, n);
173 cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
174 if (s < n)
176 mp_limb_t cy2;
177 cy2 = mpn_addlsh1_n (apx, a3, apx, s);
178 apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
179 MPN_INCR_U (apx + s, n+1-s, cy2);
181 else
182 apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
183 #else
184 cy = mpn_lshift (apx, a0, n, 1);
185 cy += mpn_add_n (apx, apx, a1, n);
186 cy = 2*cy + mpn_lshift (apx, apx, n, 1);
187 cy += mpn_add_n (apx, apx, a2, n);
188 cy = 2*cy + mpn_lshift (apx, apx, n, 1);
189 apx[n] = cy + mpn_add (apx, apx, n, a3, s);
190 #endif
192 /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */
193 #if HAVE_NATIVE_mpn_addlsh1_n
194 cy = mpn_addlsh1_n (bpx, b1, b0, n);
195 cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
196 if (t < n)
198 mp_limb_t cy2;
199 cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
200 bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
201 MPN_INCR_U (bpx + t, n+1-t, cy2);
203 else
204 bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
205 #else
206 cy = mpn_lshift (bpx, b0, n, 1);
207 cy += mpn_add_n (bpx, bpx, b1, n);
208 cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
209 cy += mpn_add_n (bpx, bpx, b2, n);
210 cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
211 bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
212 #endif
214 ASSERT (apx[n] < 15);
215 ASSERT (bpx[n] < 15);
217 TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */
219 /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */
220 flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp)));
222 /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3. */
223 flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp)));
225 TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp); /* vm1, 2n+1 limbs */
226 /* Clobbers amx, bmx. */
227 TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */
229 TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
230 if (s > t)
231 mpn_mul (vinf, a3, s, b3, t);
232 else
233 TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */
235 mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);