beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom_interpolate_7pts.c
blob8a101496fa12b54741baf53d3c97c2c50616dbd5
1 /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
3 Contributed to the GNU project by Niels Möller.
4 Improvements by Marco Bodrato.
6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 Copyright 2006, 2007, 2009, 2014 Free Software Foundation, Inc.
12 This file is part of the GNU MP Library.
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
27 or both in parallel, as here.
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32 for more details.
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library. If not,
36 see https://www.gnu.org/licenses/. */
38 #include "gmp.h"
39 #include "gmp-impl.h"
41 #define BINVERT_3 MODLIMB_INVERSE_3
43 #define BINVERT_9 \
44 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
46 #define BINVERT_15 \
47 ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
49 /* For the various mpn_divexact_byN here, fall back to using either
50 mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is
51 many faster if it is native. For now, since mpn_divexact_1 is native on
52 several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
53 mpn_pi1_bdiv_q_1 unconditionally. FIXME. */
55 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
56 #ifndef mpn_divexact_by3
57 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
58 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
59 #else
60 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
61 #endif
62 #endif
64 #ifndef mpn_divexact_by9
65 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
66 #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
67 #else
68 #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
69 #endif
70 #endif
72 #ifndef mpn_divexact_by15
73 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
74 #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
75 #else
76 #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
77 #endif
78 #endif
80 /* Interpolation for toom4, using the evaluation points 0, infinity,
81 1, -1, 2, -2, 1/2. More precisely, we want to compute
82 f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
83 seven values
85 w0 = f(0),
86 w1 = f(-2),
87 w2 = f(1),
88 w3 = f(-1),
89 w4 = f(2)
90 w5 = 64 * f(1/2)
91 w6 = limit at infinity of f(x) / x^6,
93 The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
94 w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
95 w6n }. The other values are 2n + 1 limbs each (with most
96 significant limbs small). f(-1) and f(-1/2) may be negative, signs
97 determined by the flag bits. Inputs are destroyed.
99 Needs (2*n + 1) limbs of temporary storage.
102 void
103 mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
104 mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
105 mp_size_t w6n, mp_ptr tp)
107 mp_size_t m;
108 mp_limb_t cy;
110 m = 2*n + 1;
111 #define w0 rp
112 #define w2 (rp + 2*n)
113 #define w6 (rp + 6*n)
115 ASSERT (w6n > 0);
116 ASSERT (w6n <= 2*n);
118 /* Using formulas similar to Marco Bodrato's
120 W5 = W5 + W4
121 W1 =(W4 - W1)/2
122 W4 = W4 - W0
123 W4 =(W4 - W1)/4 - W6*16
124 W3 =(W2 - W3)/2
125 W2 = W2 - W3
127 W5 = W5 - W2*65 May be negative.
128 W2 = W2 - W6 - W0
129 W5 =(W5 + W2*45)/2 Now >= 0 again.
130 W4 =(W4 - W2)/3
131 W2 = W2 - W4
133 W1 = W5 - W1 May be negative.
134 W5 =(W5 - W3*8)/9
135 W3 = W3 - W5
136 W1 =(W1/15 + W5)/2 Now >= 0 again.
137 W5 = W5 - W1
139 where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
140 W4 = f(2), W5 = f(1/2), W6 = f(oo),
142 Note that most intermediate results are positive; the ones that
143 may be negative are represented in two's complement. We must
144 never shift right a value that may be negative, since that would
145 invalidate the sign bit. On the other hand, divexact by odd
146 numbers work fine with two's complement.
149 mpn_add_n (w5, w5, w4, m);
150 if (flags & toom7_w1_neg)
152 #ifdef HAVE_NATIVE_mpn_rsh1add_n
153 mpn_rsh1add_n (w1, w1, w4, m);
154 #else
155 mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1));
156 mpn_rshift (w1, w1, m, 1);
157 #endif
159 else
161 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
162 mpn_rsh1sub_n (w1, w4, w1, m);
163 #else
164 mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1));
165 mpn_rshift (w1, w1, m, 1);
166 #endif
168 mpn_sub (w4, w4, m, w0, 2*n);
169 mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3));
170 mpn_rshift (w4, w4, m, 2); /* w4>=0 */
172 tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
173 mpn_sub (w4, w4, m, tp, w6n+1);
175 if (flags & toom7_w3_neg)
177 #ifdef HAVE_NATIVE_mpn_rsh1add_n
178 mpn_rsh1add_n (w3, w3, w2, m);
179 #else
180 mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1));
181 mpn_rshift (w3, w3, m, 1);
182 #endif
184 else
186 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
187 mpn_rsh1sub_n (w3, w2, w3, m);
188 #else
189 mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1));
190 mpn_rshift (w3, w3, m, 1);
191 #endif
194 mpn_sub_n (w2, w2, w3, m);
196 mpn_submul_1 (w5, w2, m, 65);
197 mpn_sub (w2, w2, m, w6, w6n);
198 mpn_sub (w2, w2, m, w0, 2*n);
200 mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1));
201 mpn_rshift (w5, w5, m, 1);
202 mpn_sub_n (w4, w4, w2, m);
204 mpn_divexact_by3 (w4, w4, m);
205 mpn_sub_n (w2, w2, w4, m);
207 mpn_sub_n (w1, w5, w1, m);
208 mpn_lshift (tp, w3, m, 3);
209 mpn_sub_n (w5, w5, tp, m);
210 mpn_divexact_by9 (w5, w5, m);
211 mpn_sub_n (w3, w3, w5, m);
213 mpn_divexact_by15 (w1, w1, m);
214 mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1));
215 mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
216 mpn_sub_n (w5, w5, w1, m);
218 /* These bounds are valid for the 4x4 polynomial product of toom44,
219 * and they are conservative for toom53 and toom62. */
220 ASSERT (w1[2*n] < 2);
221 ASSERT (w2[2*n] < 3);
222 ASSERT (w3[2*n] < 4);
223 ASSERT (w4[2*n] < 3);
224 ASSERT (w5[2*n] < 2);
226 /* Addition chain. Note carries and the 2n'th limbs that need to be
227 * added in.
229 * Special care is needed for w2[2n] and the corresponding carry,
230 * since the "simple" way of adding it all together would overwrite
231 * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
232 * the high half of w3 and the low half of w4.
234 * 7 6 5 4 3 2 1 0
235 * | | | | | | | | |
236 * ||w3 (2n+1)|
237 * ||w4 (2n+1)|
238 * ||w5 (2n+1)| ||w1 (2n+1)|
239 * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r)
240 * -----------------------------------------------
241 * r | | | | | | | | |
242 * c7 c6 c5 c4 c3 Carries to propagate
245 cy = mpn_add_n (rp + n, rp + n, w1, m);
246 MPN_INCR_U (w2 + n + 1, n , cy);
247 cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
248 MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
249 cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
250 MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
251 cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
252 MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
253 if (w6n > n + 1)
255 cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
256 MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
258 else
260 ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
261 #if WANT_ASSERT
263 mp_size_t i;
264 for (i = w6n; i <= n; i++)
265 ASSERT (w5[n + i] == 0);
267 #endif