3 Contributed by Niels Möller and Marco Bodrato.
5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
11 This file is part of the GNU MP Library.
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
26 or both in parallel, as here.
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
41 #define MUL(rp, ap, an, bp, bn) do { \
43 mpn_mul (rp, ap, an, bp, bn); \
45 mpn_mul (rp, bp, bn, ap, an); \
48 /* Inputs are unsigned. */
50 abs_sub_n (mp_ptr rp
, mp_srcptr ap
, mp_srcptr bp
, mp_size_t n
)
53 MPN_CMP (c
, ap
, bp
, n
);
56 mpn_sub_n (rp
, ap
, bp
, n
);
61 mpn_sub_n (rp
, bp
, ap
, n
);
67 add_signed_n (mp_ptr rp
,
68 mp_srcptr ap
, int as
, mp_srcptr bp
, int bs
, mp_size_t n
)
71 return as
^ abs_sub_n (rp
, ap
, bp
, n
);
74 ASSERT_NOCARRY (mpn_add_n (rp
, ap
, bp
, n
));
80 mpn_matrix22_mul_itch (mp_size_t rn
, mp_size_t mn
)
82 if (BELOW_THRESHOLD (rn
, MATRIX22_STRASSEN_THRESHOLD
)
83 || BELOW_THRESHOLD (mn
, MATRIX22_STRASSEN_THRESHOLD
))
86 return 3*(rn
+ mn
) + 5;
91 / s0 \ / 1 0 0 0 \ / r0 \
92 | s1 | | 0 1 0 1 | | r1 |
93 | s2 | | 0 0 -1 1 | | r2 |
94 | s3 | = | 0 1 -1 1 | \ r3 /
99 / t0 \ / 1 0 0 0 \ / m0 \
100 | t1 | | 0 1 0 1 | | m1 |
101 | t2 | | 0 0 -1 1 | | m2 |
102 | t3 | = | 0 1 -1 1 | \ m3 /
107 Note: the two matrices above are the same, but s_i and t_i are used
108 in the same product, only for i<4, see "A Strassen-like Matrix
109 Multiplication suited for squaring and higher power computation" by
110 M. Bodrato, in Proceedings of ISSAC 2010.
112 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \
113 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 |
114 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 |
115 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 |
120 The scheduling uses two temporaries U0 and U1 to store products, and
121 two, S0 and T0, to store combinations of entries of the two
125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
127 * Resulting elements are of size up to rn + mn + 1.
129 * Temporary storage: 3 rn + 3 mn + 5. */
131 mpn_matrix22_mul_strassen (mp_ptr r0
, mp_ptr r1
, mp_ptr r2
, mp_ptr r3
, mp_size_t rn
,
132 mp_srcptr m0
, mp_srcptr m1
, mp_srcptr m2
, mp_srcptr m3
, mp_size_t mn
,
135 mp_ptr s0
, t0
, u0
, u1
;
136 int r1s
, r3s
, s0s
, t0s
, u1s
;
137 s0
= tp
; tp
+= rn
+ 1;
138 t0
= tp
; tp
+= mn
+ 1;
139 u0
= tp
; tp
+= rn
+ mn
+ 1;
140 u1
= tp
; /* rn + mn + 2 */
142 MUL (u0
, r1
, rn
, m2
, mn
); /* u5 = s5 * t6 */
143 r3s
= abs_sub_n (r3
, r3
, r2
, rn
); /* r3 - r2 */
146 r1s
= abs_sub_n (r1
, r1
, r3
, rn
);
151 r1
[rn
] = mpn_add_n (r1
, r1
, r3
, rn
);
152 r1s
= 0; /* r1 - r2 + r3 */
156 s0
[rn
] = mpn_add_n (s0
, r1
, r0
, rn
);
159 else if (r1
[rn
] != 0)
161 s0
[rn
] = r1
[rn
] - mpn_sub_n (s0
, r1
, r0
, rn
);
162 s0s
= 1; /* s4 = -r0 + r1 - r2 + r3 */
167 s0s
= abs_sub_n (s0
, r0
, r1
, rn
);
170 MUL (u1
, r0
, rn
, m0
, mn
); /* u0 = s0 * t0 */
171 r0
[rn
+mn
] = mpn_add_n (r0
, u0
, u1
, rn
+ mn
);
172 ASSERT (r0
[rn
+mn
] < 2); /* u0 + u5 */
174 t0s
= abs_sub_n (t0
, m3
, m2
, mn
);
175 u1s
= r3s
^t0s
^1; /* Reverse sign! */
176 MUL (u1
, r3
, rn
, t0
, mn
); /* u2 = s2 * t2 */
180 t0s
= abs_sub_n (t0
, m1
, t0
, mn
);
185 t0
[mn
] = mpn_add_n (t0
, t0
, m1
, mn
);
188 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
189 at r3. I'd expect that for matrices of random size, the high
190 words t0[mn] and r1[rn] are non-zero with a pretty small
191 probability. If that can be confirmed this should be done as an
192 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
196 MUL (r3
, r1
, rn
, t0
, mn
+ 1); /* u3 = s3 * t3 */
199 mpn_add_n (r3
+ rn
, r3
+ rn
, t0
, mn
+ 1);
203 MUL (r3
, r1
, rn
+ 1, t0
, mn
);
206 ASSERT (r3
[rn
+mn
] < 4);
211 r3s
= abs_sub_n (r3
, u0
, r3
, rn
+ mn
+ 1);
215 ASSERT_NOCARRY (mpn_add_n (r3
, r3
, u0
, rn
+ mn
+ 1));
216 r3s
= 0; /* u3 + u5 */
221 t0
[mn
] = mpn_add_n (t0
, t0
, m0
, mn
);
223 else if (t0
[mn
] != 0)
225 t0
[mn
] -= mpn_sub_n (t0
, t0
, m0
, mn
);
229 t0s
= abs_sub_n (t0
, t0
, m0
, mn
);
231 MUL (u0
, r2
, rn
, t0
, mn
+ 1); /* u6 = s6 * t4 */
232 ASSERT (u0
[rn
+mn
] < 2);
235 ASSERT_NOCARRY (mpn_sub_n (r1
, r2
, r1
, rn
));
239 r1
[rn
] += mpn_add_n (r1
, r1
, r2
, rn
);
242 t0s
= add_signed_n (r2
, r3
, r3s
, u0
, t0s
, rn
+ mn
);
244 ASSERT (r2
[rn
+mn
-1] < 4);
245 r3s
= add_signed_n (r3
, r3
, r3s
, u1
, u1s
, rn
+ mn
);
247 ASSERT (r3
[rn
+mn
-1] < 3);
248 MUL (u0
, s0
, rn
, m1
, mn
); /* u4 = s4 * t5 */
249 ASSERT (u0
[rn
+mn
-1] < 2);
250 t0
[mn
] = mpn_add_n (t0
, m3
, m1
, mn
);
251 MUL (u1
, r1
, rn
, t0
, mn
+ 1); /* u1 = s1 * t1 */
253 ASSERT (u1
[mn
-1] < 4);
254 ASSERT (u1
[mn
] == 0);
255 ASSERT_NOCARRY (add_signed_n (r1
, r3
, r3s
, u0
, s0s
, mn
));
256 /* -u2 + u3 - u4 + u5 */
257 ASSERT (r1
[mn
-1] < 2);
260 ASSERT_NOCARRY (mpn_add_n (r3
, u1
, r3
, mn
));
264 ASSERT_NOCARRY (mpn_sub_n (r3
, u1
, r3
, mn
));
265 /* u1 + u2 - u3 - u5 */
267 ASSERT (r3
[mn
-1] < 2);
270 ASSERT_NOCARRY (mpn_add_n (r2
, u1
, r2
, mn
));
274 ASSERT_NOCARRY (mpn_sub_n (r2
, u1
, r2
, mn
));
275 /* u1 - u3 - u5 - u6 */
277 ASSERT (r2
[mn
-1] < 2);
281 mpn_matrix22_mul (mp_ptr r0
, mp_ptr r1
, mp_ptr r2
, mp_ptr r3
, mp_size_t rn
,
282 mp_srcptr m0
, mp_srcptr m1
, mp_srcptr m2
, mp_srcptr m3
, mp_size_t mn
,
285 if (BELOW_THRESHOLD (rn
, MATRIX22_STRASSEN_THRESHOLD
)
286 || BELOW_THRESHOLD (mn
, MATRIX22_STRASSEN_THRESHOLD
))
291 /* Temporary storage: 3 rn + 2 mn */
295 for (i
= 0; i
< 2; i
++)
297 MPN_COPY (tp
, r0
, rn
);
301 mpn_mul (p0
, r0
, rn
, m0
, mn
);
302 mpn_mul (p1
, r1
, rn
, m3
, mn
);
303 mpn_mul (r0
, r1
, rn
, m2
, mn
);
304 mpn_mul (r1
, tp
, rn
, m1
, mn
);
308 mpn_mul (p0
, m0
, mn
, r0
, rn
);
309 mpn_mul (p1
, m3
, mn
, r1
, rn
);
310 mpn_mul (r0
, m2
, mn
, r1
, rn
);
311 mpn_mul (r1
, m1
, mn
, tp
, rn
);
313 r0
[rn
+mn
] = mpn_add_n (r0
, r0
, p0
, rn
+ mn
);
314 r1
[rn
+mn
] = mpn_add_n (r1
, r1
, p1
, rn
+ mn
);
320 mpn_matrix22_mul_strassen (r0
, r1
, r2
, r3
, rn
,
321 m0
, m1
, m2
, m3
, mn
, tp
);