source/libs/gmp/gmp-src/mpn/generic/matrix22_mul.c

   1 /* matrix22_mul.c.
   2
   3    Contributed by Niels Möller and Marco Bodrato.
   4
   5    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   6    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   8
   9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of either:
  15
  16   * the GNU Lesser General Public License as published by the Free
  17     Software Foundation; either version 3 of the License, or (at your
  18     option) any later version.
  19
  20 or
  21
  22   * the GNU General Public License as published by the Free Software
  23     Foundation; either version 2 of the License, or (at your option) any
  24     later version.
  25
  26 or both in parallel, as here.
  27
  28 The GNU MP Library is distributed in the hope that it will be useful, but
  29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  31 for more details.
  32
  33 You should have received copies of the GNU General Public License and the
  34 GNU Lesser General Public License along with the GNU MP Library.  If not,
  35 see https://www.gnu.org/licenses/.  */
  36
  37 #include "gmp.h"
  38 #include "gmp-impl.h"
  39 #include "longlong.h"
  40
  41 #define MUL(rp, ap, an, bp, bn) do {            \
  42   if (an >= bn)                                 \
  43     mpn_mul (rp, ap, an, bp, bn);               \
  44   else                                          \
  45     mpn_mul (rp, bp, bn, ap, an);               \
  46 } while (0)
  47
  48 /* Inputs are unsigned. */
  49 static int
  50 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
  51 {
  52   int c;
  53   MPN_CMP (c, ap, bp, n);
  54   if (c >= 0)
  55     {
  56       mpn_sub_n (rp, ap, bp, n);
  57       return 0;
  58     }
  59   else
  60     {
  61       mpn_sub_n (rp, bp, ap, n);
  62       return 1;
  63     }
  64 }
  65
  66 static int
  67 add_signed_n (mp_ptr rp,
  68               mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
  69 {
  70   if (as != bs)
  71     return as ^ abs_sub_n (rp, ap, bp, n);
  72   else
  73     {
  74       ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
  75       return as;
  76     }
  77 }
  78
  79 mp_size_t
  80 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
  81 {
  82   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
  83       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
  84     return 3*rn + 2*mn;
  85   else
  86     return 3*(rn + mn) + 5;
  87 }
  88
  89 /* Algorithm:
  90
  91     / s0 \   /  1  0  0  0 \ / r0 \
  92     | s1 |   |  0  1  0  1 | | r1 |
  93     | s2 |   |  0  0 -1  1 | | r2 |
  94     | s3 | = |  0  1 -1  1 | \ r3 /
  95     | s4 |   | -1  1 -1  1 |
  96     | s5 |   |  0  1  0  0 |
  97     \ s6 /   \  0  0  1  0 /
  98
  99     / t0 \   /  1  0  0  0 \ / m0 \
 100     | t1 |   |  0  1  0  1 | | m1 |
 101     | t2 |   |  0  0 -1  1 | | m2 |
 102     | t3 | = |  0  1 -1  1 | \ m3 /
 103     | t4 |   | -1  1 -1  1 |
 104     | t5 |   |  0  1  0  0 |
 105     \ t6 /   \  0  0  1  0 /
 106
 107   Note: the two matrices above are the same, but s_i and t_i are used
 108   in the same product, only for i<4, see "A Strassen-like Matrix
 109   Multiplication suited for squaring and higher power computation" by
 110   M. Bodrato, in Proceedings of ISSAC 2010.
 111
 112     / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
 113     | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
 114     | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
 115     \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
 116                                     | s4*t5 |
 117                                     | s5*t6 |
 118                                     \ s6*t4 /
 119
 120   The scheduling uses two temporaries U0 and U1 to store products, and
 121   two, S0 and T0, to store combinations of entries of the two
 122   operands.
 123 */
 124
 125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
 126  *
 127  * Resulting elements are of size up to rn + mn + 1.
 128  *
 129  * Temporary storage: 3 rn + 3 mn + 5. */
 130 void
 131 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
 132                            mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
 133                            mp_ptr tp)
 134 {
 135   mp_ptr s0, t0, u0, u1;
 136   int r1s, r3s, s0s, t0s, u1s;
 137   s0 = tp; tp += rn + 1;
 138   t0 = tp; tp += mn + 1;
 139   u0 = tp; tp += rn + mn + 1;
 140   u1 = tp; /* rn + mn + 2 */
 141
 142   MUL (u0, r1, rn, m2, mn);             /* u5 = s5 * t6 */
 143   r3s = abs_sub_n (r3, r3, r2, rn);     /* r3 - r2 */
 144   if (r3s)
 145     {
 146       r1s = abs_sub_n (r1, r1, r3, rn);
 147       r1[rn] = 0;
 148     }
 149   else
 150     {
 151       r1[rn] = mpn_add_n (r1, r1, r3, rn);
 152       r1s = 0;                          /* r1 - r2 + r3  */
 153     }
 154   if (r1s)
 155     {
 156       s0[rn] = mpn_add_n (s0, r1, r0, rn);
 157       s0s = 0;
 158     }
 159   else if (r1[rn] != 0)
 160     {
 161       s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
 162       s0s = 1;                          /* s4 = -r0 + r1 - r2 + r3 */
 163                                         /* Reverse sign! */
 164     }
 165   else
 166     {
 167       s0s = abs_sub_n (s0, r0, r1, rn);
 168       s0[rn] = 0;
 169     }
 170   MUL (u1, r0, rn, m0, mn);             /* u0 = s0 * t0 */
 171   r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
 172   ASSERT (r0[rn+mn] < 2);               /* u0 + u5 */
 173
 174   t0s = abs_sub_n (t0, m3, m2, mn);
 175   u1s = r3s^t0s^1;                      /* Reverse sign! */
 176   MUL (u1, r3, rn, t0, mn);             /* u2 = s2 * t2 */
 177   u1[rn+mn] = 0;
 178   if (t0s)
 179     {
 180       t0s = abs_sub_n (t0, m1, t0, mn);
 181       t0[mn] = 0;
 182     }
 183   else
 184     {
 185       t0[mn] = mpn_add_n (t0, t0, m1, mn);
 186     }
 187
 188   /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
 189      at r3. I'd expect that for matrices of random size, the high
 190      words t0[mn] and r1[rn] are non-zero with a pretty small
 191      probability. If that can be confirmed this should be done as an
 192      unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
 193      add_n. */
 194   if (t0[mn] != 0)
 195     {
 196       MUL (r3, r1, rn, t0, mn + 1);     /* u3 = s3 * t3 */
 197       ASSERT (r1[rn] < 2);
 198       if (r1[rn] != 0)
 199         mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
 200     }
 201   else
 202     {
 203       MUL (r3, r1, rn + 1, t0, mn);
 204     }
 205
 206   ASSERT (r3[rn+mn] < 4);
 207
 208   u0[rn+mn] = 0;
 209   if (r1s^t0s)
 210     {
 211       r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
 212     }
 213   else
 214     {
 215       ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
 216       r3s = 0;                          /* u3 + u5 */
 217     }
 218
 219   if (t0s)
 220     {
 221       t0[mn] = mpn_add_n (t0, t0, m0, mn);
 222     }
 223   else if (t0[mn] != 0)
 224     {
 225       t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
 226     }
 227   else
 228     {
 229       t0s = abs_sub_n (t0, t0, m0, mn);
 230     }
 231   MUL (u0, r2, rn, t0, mn + 1);         /* u6 = s6 * t4 */
 232   ASSERT (u0[rn+mn] < 2);
 233   if (r1s)
 234     {
 235       ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
 236     }
 237   else
 238     {
 239       r1[rn] += mpn_add_n (r1, r1, r2, rn);
 240     }
 241   rn++;
 242   t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
 243                                         /* u3 + u5 + u6 */
 244   ASSERT (r2[rn+mn-1] < 4);
 245   r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
 246                                         /* -u2 + u3 + u5  */
 247   ASSERT (r3[rn+mn-1] < 3);
 248   MUL (u0, s0, rn, m1, mn);             /* u4 = s4 * t5 */
 249   ASSERT (u0[rn+mn-1] < 2);
 250   t0[mn] = mpn_add_n (t0, m3, m1, mn);
 251   MUL (u1, r1, rn, t0, mn + 1);         /* u1 = s1 * t1 */
 252   mn += rn;
 253   ASSERT (u1[mn-1] < 4);
 254   ASSERT (u1[mn] == 0);
 255   ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
 256                                         /* -u2 + u3 - u4 + u5  */
 257   ASSERT (r1[mn-1] < 2);
 258   if (r3s)
 259     {
 260       ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
 261     }
 262   else
 263     {
 264       ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
 265                                         /* u1 + u2 - u3 - u5  */
 266     }
 267   ASSERT (r3[mn-1] < 2);
 268   if (t0s)
 269     {
 270       ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
 271     }
 272   else
 273     {
 274       ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
 275                                         /* u1 - u3 - u5 - u6  */
 276     }
 277   ASSERT (r2[mn-1] < 2);
 278 }
 279
 280 void
 281 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
 282                   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
 283                   mp_ptr tp)
 284 {
 285   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
 286       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
 287     {
 288       mp_ptr p0, p1;
 289       unsigned i;
 290
 291       /* Temporary storage: 3 rn + 2 mn */
 292       p0 = tp + rn;
 293       p1 = p0 + rn + mn;
 294
 295       for (i = 0; i < 2; i++)
 296         {
 297           MPN_COPY (tp, r0, rn);
 298
 299           if (rn >= mn)
 300             {
 301               mpn_mul (p0, r0, rn, m0, mn);
 302               mpn_mul (p1, r1, rn, m3, mn);
 303               mpn_mul (r0, r1, rn, m2, mn);
 304               mpn_mul (r1, tp, rn, m1, mn);
 305             }
 306           else
 307             {
 308               mpn_mul (p0, m0, mn, r0, rn);
 309               mpn_mul (p1, m3, mn, r1, rn);
 310               mpn_mul (r0, m2, mn, r1, rn);
 311               mpn_mul (r1, m1, mn, tp, rn);
 312             }
 313           r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
 314           r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
 315
 316           r0 = r2; r1 = r3;
 317         }
 318     }
 319   else
 320     mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
 321                                m0, m1, m2, m3, mn, tp);
 322 }