beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom32_mul.c
blob0b05669cc4e00a35291c9fd83ad2999b5a3fba8f
/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
	|_b1_|___b0_|
	<-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
/* Balanced n x n recursive multiply used for the point products.
   The ws (scratch) argument is deliberately unused: mpn_mul_n needs no
   caller-supplied scratch.  It is kept in the macro signature so call
   sites need not change if a scratch-using multiply is plugged in.  */
#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
65 void
66 mpn_toom32_mul (mp_ptr pp,
67 mp_srcptr ap, mp_size_t an,
68 mp_srcptr bp, mp_size_t bn,
69 mp_ptr scratch)
71 mp_size_t n, s, t;
72 int vm1_neg;
73 mp_limb_t cy;
74 mp_limb_signed_t hi;
75 mp_limb_t ap1_hi, bp1_hi;
77 #define a0 ap
78 #define a1 (ap + n)
79 #define a2 (ap + 2 * n)
80 #define b0 bp
81 #define b1 (bp + n)
83 /* Required, to ensure that s + t >= n. */
84 ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
86 n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
88 s = an - 2 * n;
89 t = bn - n;
91 ASSERT (0 < s && s <= n);
92 ASSERT (0 < t && t <= n);
93 ASSERT (s + t >= n);
95 /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
96 #define ap1 (pp) /* n, most significant limb in ap1_hi */
97 #define bp1 (pp + n) /* n, most significant bit in bp1_hi */
98 #define am1 (pp + 2*n) /* n, most significant bit in hi */
99 #define bm1 (pp + 3*n) /* n */
100 #define v1 (scratch) /* 2n + 1 */
101 #define vm1 (pp) /* 2n + 1 */
102 #define scratch_out (scratch + 2*n + 1) /* Currently unused. */
104 /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
106 /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
108 /* Compute ap1 = a0 + a1 + a3, am1 = a0 - a1 + a3 */
109 ap1_hi = mpn_add (ap1, a0, n, a2, s);
110 #if HAVE_NATIVE_mpn_add_n_sub_n
111 if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
113 ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
114 hi = 0;
115 vm1_neg = 1;
117 else
119 cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
120 hi = ap1_hi - (cy & 1);
121 ap1_hi += (cy >> 1);
122 vm1_neg = 0;
124 #else
125 if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
127 ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
128 hi = 0;
129 vm1_neg = 1;
131 else
133 hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
134 vm1_neg = 0;
136 ap1_hi += mpn_add_n (ap1, ap1, a1, n);
137 #endif
139 /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
140 if (t == n)
142 #if HAVE_NATIVE_mpn_add_n_sub_n
143 if (mpn_cmp (b0, b1, n) < 0)
145 cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
146 vm1_neg ^= 1;
148 else
150 cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
152 bp1_hi = cy >> 1;
153 #else
154 bp1_hi = mpn_add_n (bp1, b0, b1, n);
156 if (mpn_cmp (b0, b1, n) < 0)
158 ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
159 vm1_neg ^= 1;
161 else
163 ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
165 #endif
167 else
169 /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
170 bp1_hi = mpn_add (bp1, b0, n, b1, t);
172 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
174 ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
175 MPN_ZERO (bm1 + t, n - t);
176 vm1_neg ^= 1;
178 else
180 ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
184 TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
185 if (ap1_hi == 1)
187 cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
189 else if (ap1_hi == 2)
191 #if HAVE_NATIVE_mpn_addlsh1_n
192 cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
193 #else
194 cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
195 #endif
197 else
198 cy = 0;
199 if (bp1_hi != 0)
200 cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
201 v1[2 * n] = cy;
203 TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
204 if (hi)
205 hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
207 vm1[2*n] = hi;
209 /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
210 if (vm1_neg)
212 #if HAVE_NATIVE_mpn_rsh1sub_n
213 mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
214 #else
215 mpn_sub_n (v1, v1, vm1, 2*n+1);
216 ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
217 #endif
219 else
221 #if HAVE_NATIVE_mpn_rsh1add_n
222 mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
223 #else
224 mpn_add_n (v1, v1, vm1, 2*n+1);
225 ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
226 #endif
229 /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
231 y = x1 + x3 + (x0 + x2) * B
232 = (x0 + x2) * B + (x0 + x2) - vm1.
234 y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
235 follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
236 (already in place, except for carry propagation).
238 We thus add
240 B^3 B^2 B 1
241 | | | |
242 +-----+----+
243 + | x0 + x2 |
244 +----+-----+----+
245 + | x0 + x2 |
246 +----------+
247 - | vm1 |
248 --+----++----+----+-
249 | y2 | y1 | y0 |
250 +-----+----+----+
252 Since we store y0 at the same location as the low half of x0 + x2, we
253 need to do the middle sum first. */
255 hi = vm1[2*n];
256 cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
257 MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
259 /* FIXME: Can we get rid of this second vm1_neg conditional by
260 swapping the location of +1 and -1 values? */
261 if (vm1_neg)
263 cy = mpn_add_n (v1, v1, vm1, n);
264 hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
265 MPN_INCR_U (v1 + n, n+1, hi);
267 else
269 cy = mpn_sub_n (v1, v1, vm1, n);
270 hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
271 MPN_DECR_U (v1 + n, n+1, hi);
274 TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
275 /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */
276 if (s > t) mpn_mul (pp+3*n, a2, s, b1, t);
277 else mpn_mul (pp+3*n, b1, t, a2, s);
279 /* Remaining interpolation.
281 y * B + x0 + x3 B^3 - x0 B^2 - x3 B
282 = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
283 = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B
284 + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
285 = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
286 + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
288 B^4 B^3 B^2 B 1
289 | | | | | |
290 +-------+ +---------+---------+
291 | Hx3 | | Hx0-Lx3 | Lx0 |
292 +------+----------+---------+---------+---------+
293 | y2 | y1 | y0 |
294 ++---------+---------+---------+
295 -| Hx0-Lx3 | - Lx0 |
296 +---------+---------+
297 | - Hx3 |
298 +--------+
300 We must take into account the carry from Hx0 - Lx3.
303 cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
304 hi = scratch[2*n] + cy;
306 cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
307 hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
309 hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
311 /* FIXME: Is support for s + t == n needed? */
312 if (LIKELY (s + t > n))
314 hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
316 if (hi < 0)
317 MPN_DECR_U (pp + 4*n, s+t-n, -hi);
318 else
319 MPN_INCR_U (pp + 4*n, s+t-n, hi);
321 else
322 ASSERT (hi == 0);