beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / matrix22_mul.c
blob59531eb1b26a1846abfd77090350a597c5faf9f2
1 /* matrix22_mul.c.
3 Contributed by Niels Möller and Marco Bodrato.
5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
11 This file is part of the GNU MP Library.
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
26 or both in parallel, as here.
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 for more details.
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
37 #include "gmp.h"
38 #include "gmp-impl.h"
39 #include "longlong.h"
41 #define MUL(rp, ap, an, bp, bn) do { \
42 if (an >= bn) \
43 mpn_mul (rp, ap, an, bp, bn); \
44 else \
45 mpn_mul (rp, bp, bn, ap, an); \
46 } while (0)
48 /* Inputs are unsigned. */
49 static int
50 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
52 int c;
53 MPN_CMP (c, ap, bp, n);
54 if (c >= 0)
56 mpn_sub_n (rp, ap, bp, n);
57 return 0;
59 else
61 mpn_sub_n (rp, bp, ap, n);
62 return 1;
66 static int
67 add_signed_n (mp_ptr rp,
68 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
70 if (as != bs)
71 return as ^ abs_sub_n (rp, ap, bp, n);
72 else
74 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
75 return as;
79 mp_size_t
80 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
82 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
83 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
84 return 3*rn + 2*mn;
85 else
86 return 3*(rn + mn) + 5;
89 /* Algorithm:
91 / s0 \ / 1 0 0 0 \ / r0 \
92 | s1 | | 0 1 0 1 | | r1 |
93 | s2 | | 0 0 -1 1 | | r2 |
94 | s3 | = | 0 1 -1 1 | \ r3 /
95 | s4 | | -1 1 -1 1 |
96 | s5 | | 0 1 0 0 |
97 \ s6 / \ 0 0 1 0 /
99 / t0 \ / 1 0 0 0 \ / m0 \
100 | t1 | | 0 1 0 1 | | m1 |
101 | t2 | | 0 0 -1 1 | | m2 |
102 | t3 | = | 0 1 -1 1 | \ m3 /
103 | t4 | | -1 1 -1 1 |
104 | t5 | | 0 1 0 0 |
105 \ t6 / \ 0 0 1 0 /
107 Note: the two matrices above are the same, but s_i and t_i are used
108 in the same product, only for i<4, see "A Strassen-like Matrix
109 Multiplication suited for squaring and higher power computation" by
110 M. Bodrato, in Proceedings of ISSAC 2010.
112 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \
113 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 |
114 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 |
115 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 |
116 | s4*t5 |
117 | s5*t6 |
118 \ s6*t4 /
120 The scheduling uses two temporaries U0 and U1 to store products, and
121 two, S0 and T0, to store combinations of entries of the two
122 operands.
125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
127 * Resulting elements are of size up to rn + mn + 1.
129 * Temporary storage: 3 rn + 3 mn + 5. */
130 void
131 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
132 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
133 mp_ptr tp)
135 mp_ptr s0, t0, u0, u1;
136 int r1s, r3s, s0s, t0s, u1s;
137 s0 = tp; tp += rn + 1;
138 t0 = tp; tp += mn + 1;
139 u0 = tp; tp += rn + mn + 1;
140 u1 = tp; /* rn + mn + 2 */
142 MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */
143 r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */
144 if (r3s)
146 r1s = abs_sub_n (r1, r1, r3, rn);
147 r1[rn] = 0;
149 else
151 r1[rn] = mpn_add_n (r1, r1, r3, rn);
152 r1s = 0; /* r1 - r2 + r3 */
154 if (r1s)
156 s0[rn] = mpn_add_n (s0, r1, r0, rn);
157 s0s = 0;
159 else if (r1[rn] != 0)
161 s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
162 s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */
163 /* Reverse sign! */
165 else
167 s0s = abs_sub_n (s0, r0, r1, rn);
168 s0[rn] = 0;
170 MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */
171 r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
172 ASSERT (r0[rn+mn] < 2); /* u0 + u5 */
174 t0s = abs_sub_n (t0, m3, m2, mn);
175 u1s = r3s^t0s^1; /* Reverse sign! */
176 MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */
177 u1[rn+mn] = 0;
178 if (t0s)
180 t0s = abs_sub_n (t0, m1, t0, mn);
181 t0[mn] = 0;
183 else
185 t0[mn] = mpn_add_n (t0, t0, m1, mn);
188 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
189 at r3. I'd expect that for matrices of random size, the high
190 words t0[mn] and r1[rn] are non-zero with a pretty small
191 probability. If that can be confirmed this should be done as an
192 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
193 add_n. */
194 if (t0[mn] != 0)
196 MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */
197 ASSERT (r1[rn] < 2);
198 if (r1[rn] != 0)
199 mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
201 else
203 MUL (r3, r1, rn + 1, t0, mn);
206 ASSERT (r3[rn+mn] < 4);
208 u0[rn+mn] = 0;
209 if (r1s^t0s)
211 r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
213 else
215 ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
216 r3s = 0; /* u3 + u5 */
219 if (t0s)
221 t0[mn] = mpn_add_n (t0, t0, m0, mn);
223 else if (t0[mn] != 0)
225 t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
227 else
229 t0s = abs_sub_n (t0, t0, m0, mn);
231 MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */
232 ASSERT (u0[rn+mn] < 2);
233 if (r1s)
235 ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
237 else
239 r1[rn] += mpn_add_n (r1, r1, r2, rn);
241 rn++;
242 t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
243 /* u3 + u5 + u6 */
244 ASSERT (r2[rn+mn-1] < 4);
245 r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
246 /* -u2 + u3 + u5 */
247 ASSERT (r3[rn+mn-1] < 3);
248 MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */
249 ASSERT (u0[rn+mn-1] < 2);
250 t0[mn] = mpn_add_n (t0, m3, m1, mn);
251 MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */
252 mn += rn;
253 ASSERT (u1[mn-1] < 4);
254 ASSERT (u1[mn] == 0);
255 ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
256 /* -u2 + u3 - u4 + u5 */
257 ASSERT (r1[mn-1] < 2);
258 if (r3s)
260 ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
262 else
264 ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
265 /* u1 + u2 - u3 - u5 */
267 ASSERT (r3[mn-1] < 2);
268 if (t0s)
270 ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
272 else
274 ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
275 /* u1 - u3 - u5 - u6 */
277 ASSERT (r2[mn-1] < 2);
280 void
281 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
282 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
283 mp_ptr tp)
285 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
286 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
288 mp_ptr p0, p1;
289 unsigned i;
291 /* Temporary storage: 3 rn + 2 mn */
292 p0 = tp + rn;
293 p1 = p0 + rn + mn;
295 for (i = 0; i < 2; i++)
297 MPN_COPY (tp, r0, rn);
299 if (rn >= mn)
301 mpn_mul (p0, r0, rn, m0, mn);
302 mpn_mul (p1, r1, rn, m3, mn);
303 mpn_mul (r0, r1, rn, m2, mn);
304 mpn_mul (r1, tp, rn, m1, mn);
306 else
308 mpn_mul (p0, m0, mn, r0, rn);
309 mpn_mul (p1, m3, mn, r1, rn);
310 mpn_mul (r0, m2, mn, r1, rn);
311 mpn_mul (r1, m1, mn, tp, rn);
313 r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
314 r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
316 r0 = r2; r1 = r3;
319 else
320 mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
321 m0, m1, m2, m3, mn, tp);