[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / toom53_mul.c
/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
               |__b2|___b1_|___b0_|
               <-t--><--n--><--n-->

  v0  =    a0                  *  b0          #   A(0)*B(0)
  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #   A(1)*B(1)    ah  <= 4   bh <= 2
  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #  A(-1)*B(-1)  |ah| <= 2   bh <= 1
  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #   A(2)*B(2)    ah  <= 30  bh <= 6
  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #  A(-2)*B(-2)  -9<=ah<=20 -1<=bh<=4
  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2)  ah  <= 30  bh <= 6
  vinf=                     a4 *          b2  # A(inf)*B(inf)
*/
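
/* In polynomial terms, with x = 2^(GMP_NUMB_BITS * n) the operands are
   A(x) = a4*x^4 + a3*x^3 + a2*x^2 + a1*x + a0 and
   B(x) = b2*x^2 + b1*x + b0, so the product C(x) = A(x)*B(x) has degree 6,
   i.e. 7 coefficients, and the 7 point values above determine it uniquely;
   mpn_toom_interpolate_7pts recovers the coefficients at the end.

   Purely illustrative toy example (decimal digits standing in for limbs):
   a = 12345 gives a4..a0 = 1,2,3,4,5 and b = 678 gives b2..b0 = 6,7,8, so
   v0 = 5*8 = 40, v1 = (5+4+3+2+1)*(8+7+6) = 315, vinf = 1*6 = 6, etc.  */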

void
mpn_toom53_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  mp_ptr gp;
  mp_ptr as1, asm1, as2, asm2, ash;
  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
  mp_ptr tmp;
  enum toom7_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);
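
  /* n = max (ceil (an/5), ceil (bn/3)); a is split into four blocks of n
     limbs plus a top block a4 of s limbs, b into two blocks of n limbs
     plus a top block b2 of t limbs.  */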

  s = an - 4 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
  as1  = tmp; tmp += n + 1;
  asm1 = tmp; tmp += n + 1;
  as2  = tmp; tmp += n + 1;
  asm2 = tmp; tmp += n + 1;
  ash  = tmp; tmp += n + 1;
  bs1  = tmp; tmp += n + 1;
  bsm1 = tmp; tmp += n + 1;
  bs2  = tmp; tmp += n + 1;
  bsm2 = tmp; tmp += n + 1;
  bsh  = tmp; tmp += n + 1;

  gp = pp;

  /* Compute as1 and asm1.  */
  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));

  /* Compute as2 and asm2.  */
  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));

  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (ash, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
  if (s < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
      MPN_INCR_U (ash + s, n+1-s, cy2);
    }
  else
    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
#else
  cy = mpn_lshift (ash, a0, n, 1);
  cy += mpn_add_n (ash, ash, a1, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a2, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  cy += mpn_add_n (ash, ash, a3, n);
  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
#endif

  /* Compute bs1 and bsm1.  */
  bs1[n] = mpn_add (bs1, b0, n, b2, t);   /* b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
      bsm1[n] = bs1[n] - (cy & 1);
      bs1[n] += (cy >> 1);
    }
#else
  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bs1, n);
      bsm1[n] = 0;
      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
    }
  else
    {
      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
    }
  bs1[n] += mpn_add_n (bs1, bs1, b1, n);  /* b0+b1+b2 */
#endif
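
  /* At this point bs1 = B(1) = b0 + b1 + b2 and bsm1 = |B(-1)| = |b0 - b1 + b2|;
     toom7_w3_neg is toggled when B(-1) < 0 (i.e. b1 > b0 + b2), so the flag
     ends up recording the sign of vm1 = A(-1)*B(-1) for the interpolation.  */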

  /* Compute bs2 and bsm2.  */
#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
#if HAVE_NATIVE_mpn_addlsh2_n
  cy = mpn_addlsh2_n (bs2, b0, b2, t);
#else /* HAVE_NATIVE_mpn_addlsh_n */
  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
#endif
  if (t < n)
    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
  bs2[n] = cy;
#else
  cy = mpn_lshift (gp, b2, t, 2);
  bs2[n] = mpn_add (bs2, b0, n, gp, t);
  MPN_INCR_U (bs2 + t, n+1-t, cy);
#endif

  gp[n] = mpn_lshift (gp, b1, n, 1);

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
    }
#else
  if (mpn_cmp (bs2, gp, n+1) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
    }
  mpn_add_n (bs2, bs2, gp, n+1);
#endif
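
  /* Now bs2 = B(2) = b0 + 2*b1 + 4*b2 and bsm2 = |B(-2)| = |b0 - 2*b1 + 4*b2|;
     toom7_w1_neg is toggled when B(-2) < 0 (i.e. 2*b1 > b0 + 4*b2), so the
     flag records the sign of vm2 = A(-2)*B(-2).  */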

  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1) + b2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bsh, b1, b0, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
      MPN_INCR_U (bsh + t, n+1-t, cy2);
    }
  else
    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
#else
  cy = mpn_lshift (bsh, b0, n, 1);
  cy += mpn_add_n (bsh, bsh, b1, n);
  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
#endif
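
  /* ash = 16*a0 + 8*a1 + 4*a2 + 2*a3 + a4 = 16*A(1/2) and
     bsh = 4*b0 + 2*b1 + b2 = 4*B(1/2), so vh = ash*bsh below is the value
     of the product at 1/2 scaled by 2^6.  */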

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 2);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 30);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 4);
  ASSERT (ash[n] <= 30);
  ASSERT (bsh[n] <= 6);

#define v0    pp                              /* 2n */
#define v1    (pp + 2 * n)                    /* 2n+1 */
#define vinf  (pp + 6 * n)                    /* s+t */
#define v2    scratch                         /* 2n+1 */
#define vm2   (scratch + 2 * n + 1)           /* 2n+1 */
#define vh    (scratch + 4 * n + 2)           /* 2n+1 */
#define vm1   (scratch + 6 * n + 3)           /* 2n+1 */
#define scratch_out (scratch + 8 * n + 4)     /* 2n+1 */
          /* Total scratch need: 10*n+5 */
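
  /* Callers must supply at least that much scratch; within GMP the size is
     presumably obtained from the mpn_toom53_mul_itch macro in gmp-impl.h.  */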

  /* Must be in allocation order, as they overwrite one limb beyond
   * 2n+1.  */
  mpn_mul_n (v2, as2, bs2, n + 1);            /* v2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n + 1);         /* vm2, 2n+1 limbs */
  mpn_mul_n (vh, ash, bsh, n + 1);            /* vh, 2n+1 limbs */

  /* vm1, 2n+1 limbs */
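  /* With SMALLER_RECURSION the recursive product is taken over n limbs only
     and the small top limbs asm1[n] and bsm1[n] are folded in afterwards;
     otherwise the multiplication is simply done on n+1 limbs whenever either
     top limb is nonzero.  */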
#ifdef SMALLER_RECURSION
  mpn_mul_n (vm1, asm1, bsm1, n);
  if (asm1[n] == 1)
    {
      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bsm1[n] + mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  vm1[2 * n] = 0;
  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
#endif /* SMALLER_RECURSION */

  /* v1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else /* SMALLER_RECURSION */
  v1[2 * n] = 0;
  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
#endif /* SMALLER_RECURSION */

  mpn_mul_n (v0, a0, b0, n);                  /* v0, 2n limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
  else        mpn_mul (vinf, b2, t, a4, s);
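
  /* Interpolation: v0, v1 and vinf already sit in pp (at pp, pp + 2*n and
     pp + 6*n); mpn_toom_interpolate_7pts reconstructs the seven coefficients
     of the product from the seven point values and leaves the full an + bn
     limb result in pp.  */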
  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
                             scratch_out);

  TMP_FREE;
}