source/libs/gmp/gmp-src/mpn/generic/toom52_mul.c

   1 /* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
   2    times as large as bn.  Or more accurately, bn < an < 2 bn.
   3
   4    Contributed to the GNU project by Marco Bodrato.
   5
   6    The idea of applying toom to unbalanced multiplication is due to Marco
   7    Bodrato and Alberto Zanoni.
   8
   9    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
  10    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
  11    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
  12
  13 Copyright 2009 Free Software Foundation, Inc.
  14
  15 This file is part of the GNU MP Library.
  16
  17 The GNU MP Library is free software; you can redistribute it and/or modify
  18 it under the terms of either:
  19
  20   * the GNU Lesser General Public License as published by the Free
  21     Software Foundation; either version 3 of the License, or (at your
  22     option) any later version.
  23
  24 or
  25
  26   * the GNU General Public License as published by the Free Software
  27     Foundation; either version 2 of the License, or (at your option) any
  28     later version.
  29
  30 or both in parallel, as here.
  31
  32 The GNU MP Library is distributed in the hope that it will be useful, but
  33 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  34 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  35 for more details.
  36
  37 You should have received copies of the GNU General Public License and the
  38 GNU Lesser General Public License along with the GNU MP Library.  If not,
  39 see https://www.gnu.org/licenses/.  */
  40
  41
  42 #include "gmp.h"
  43 #include "gmp-impl.h"
  44
  45 /* Evaluate in: -2, -1, 0, +1, +2, +inf
  46
  47   <-s-><--n--><--n--><--n--><--n-->
  48    ___ ______ ______ ______ ______
  49   |a4_|___a3_|___a2_|___a1_|___a0_|
  50                         |b1|___b0_|
  51                         <t-><--n-->
  52
  53   v0  =  a0                  * b0      #   A(0)*B(0)
  54   v1  = (a0+ a1+ a2+ a3+  a4)*(b0+ b1) #   A(1)*B(1)      ah  <= 4   bh <= 1
  55   vm1 = (a0- a1+ a2- a3+  a4)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
  56   v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) #   A(2)*B(2)      ah  <= 30  bh <= 2
  57   vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) #  A(-2)*B(-2)    |ah| <= 20 |bh|<= 1
  58   vinf=                   a4 *     b1  # A(inf)*B(inf)
  59
  60   Some slight optimization in evaluation are taken from the paper:
  61   "Towards Optimal Toom-Cook Multiplication for Univariate and
  62   Multivariate Polynomials in Characteristic 2 and 0."
  63 */
  64
  65 void
  66 mpn_toom52_mul (mp_ptr pp,
  67                 mp_srcptr ap, mp_size_t an,
  68                 mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
  69 {
  70   mp_size_t n, s, t;
  71   enum toom6_flags flags;
  72
  73 #define a0  ap
  74 #define a1  (ap + n)
  75 #define a2  (ap + 2 * n)
  76 #define a3  (ap + 3 * n)
  77 #define a4  (ap + 4 * n)
  78 #define b0  bp
  79 #define b1  (bp + n)
  80
  81   n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
  82
  83   s = an - 4 * n;
  84   t = bn - n;
  85
  86   ASSERT (0 < s && s <= n);
  87   ASSERT (0 < t && t <= n);
  88
  89   /* Ensures that 5 values of n+1 limbs each fits in the product area.
  90      Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
  91      n = 8. */
  92   ASSERT (s+t >= 5);
  93
  94 #define v0    pp                                /* 2n */
  95 #define vm1   (scratch)                         /* 2n+1 */
  96 #define v1    (pp + 2 * n)                      /* 2n+1 */
  97 #define vm2   (scratch + 2 * n + 1)             /* 2n+1 */
  98 #define v2    (scratch + 4 * n + 2)             /* 2n+1 */
  99 #define vinf  (pp + 5 * n)                      /* s+t */
 100 #define bs1    pp                               /* n+1 */
 101 #define bsm1  (scratch + 2 * n + 2)             /* n   */
 102 #define asm1  (scratch + 3 * n + 3)             /* n+1 */
 103 #define asm2  (scratch + 4 * n + 4)             /* n+1 */
 104 #define bsm2  (pp + n + 1)                      /* n+1 */
 105 #define bs2   (pp + 2 * n + 2)                  /* n+1 */
 106 #define as2   (pp + 3 * n + 3)                  /* n+1 */
 107 #define as1   (pp + 4 * n + 4)                  /* n+1 */
 108
 109   /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
 110      products will overwrite 2n+2 limbs. */
 111
 112 #define a0a2  scratch
 113 #define a1a3  asm1
 114
 115   /* Compute as2 and asm2.  */
 116   flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3));
 117
 118   /* Compute bs1 and bsm1.  */
 119   if (t == n)
 120     {
 121 #if HAVE_NATIVE_mpn_add_n_sub_n
 122       mp_limb_t cy;
 123
 124       if (mpn_cmp (b0, b1, n) < 0)
 125         {
 126           cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
 127           flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
 128         }
 129       else
 130         {
 131           cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
 132         }
 133       bs1[n] = cy >> 1;
 134 #else
 135       bs1[n] = mpn_add_n (bs1, b0, b1, n);
 136       if (mpn_cmp (b0, b1, n) < 0)
 137         {
 138           mpn_sub_n (bsm1, b1, b0, n);
 139           flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
 140         }
 141       else
 142         {
 143           mpn_sub_n (bsm1, b0, b1, n);
 144         }
 145 #endif
 146     }
 147   else
 148     {
 149       bs1[n] = mpn_add (bs1, b0, n, b1, t);
 150       if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
 151         {
 152           mpn_sub_n (bsm1, b1, b0, t);
 153           MPN_ZERO (bsm1 + t, n - t);
 154           flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
 155         }
 156       else
 157         {
 158           mpn_sub (bsm1, b0, n, b1, t);
 159         }
 160     }
 161
 162   /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1  */
 163   mpn_add (bs2, bs1, n+1, b1, t);
 164   if (flags & toom6_vm1_neg )
 165     {
 166       bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
 167       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
 168     }
 169   else
 170     {
 171       bsm2[n] = 0;
 172       if (t == n)
 173         {
 174           if (mpn_cmp (bsm1, b1, n) < 0)
 175             {
 176               mpn_sub_n (bsm2, b1, bsm1, n);
 177               flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
 178             }
 179           else
 180             {
 181               mpn_sub_n (bsm2, bsm1, b1, n);
 182             }
 183         }
 184       else
 185         {
 186           if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
 187             {
 188               mpn_sub_n (bsm2, b1, bsm1, t);
 189               MPN_ZERO (bsm2 + t, n - t);
 190               flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
 191             }
 192           else
 193             {
 194               mpn_sub (bsm2, bsm1, n, b1, t);
 195             }
 196         }
 197     }
 198
 199   /* Compute as1 and asm1.  */
 200   flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2)));
 201
 202   ASSERT (as1[n] <= 4);
 203   ASSERT (bs1[n] <= 1);
 204   ASSERT (asm1[n] <= 2);
 205 /*   ASSERT (bsm1[n] <= 1); */
 206   ASSERT (as2[n] <=30);
 207   ASSERT (bs2[n] <= 2);
 208   ASSERT (asm2[n] <= 20);
 209   ASSERT (bsm2[n] <= 1);
 210
 211   /* vm1, 2n+1 limbs */
 212   mpn_mul (vm1, asm1, n+1, bsm1, n);  /* W4 */
 213
 214   /* vm2, 2n+1 limbs */
 215   mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
 216
 217   /* v2, 2n+1 limbs */
 218   mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
 219
 220   /* v1, 2n+1 limbs */
 221   mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
 222
 223   /* vinf, s+t limbs */   /* W0 */
 224   if (s > t)  mpn_mul (vinf, a4, s, b1, t);
 225   else        mpn_mul (vinf, b1, t, a4, s);
 226
 227   /* v0, 2n limbs */
 228   mpn_mul_n (v0, ap, bp, n);  /* W5 */
 229
 230   mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
 231
 232 #undef v0
 233 #undef vm1
 234 #undef v1
 235 #undef vm2
 236 #undef v2
 237 #undef vinf
 238 #undef bs1
 239 #undef bs2
 240 #undef bsm1
 241 #undef bsm2
 242 #undef asm1
 243 #undef asm2
 244 #undef as1
 245 #undef as2
 246 #undef a0a2
 247 #undef b0b2
 248 #undef a1a3
 249 #undef a0
 250 #undef a1
 251 #undef a2
 252 #undef a3
 253 #undef b0
 254 #undef b1
 255 #undef b2
 256
 257 }