source/libs/gmp/gmp-src/mpn/generic/toom_interpolate_7pts.c

   1 /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
   2
   3    Contributed to the GNU project by Niels Möller.
   4    Improvements by Marco Bodrato.
   5
   6    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   7    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   8    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   9
  10 Copyright 2006, 2007, 2009, 2014 Free Software Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of either:
  16
  17   * the GNU Lesser General Public License as published by the Free
  18     Software Foundation; either version 3 of the License, or (at your
  19     option) any later version.
  20
  21 or
  22
  23   * the GNU General Public License as published by the Free Software
  24     Foundation; either version 2 of the License, or (at your option) any
  25     later version.
  26
  27 or both in parallel, as here.
  28
  29 The GNU MP Library is distributed in the hope that it will be useful, but
  30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  32 for more details.
  33
  34 You should have received copies of the GNU General Public License and the
  35 GNU Lesser General Public License along with the GNU MP Library.  If not,
  36 see https://www.gnu.org/licenses/.  */
  37
  38 #include "gmp.h"
  39 #include "gmp-impl.h"
  40
  41 #define BINVERT_3 MODLIMB_INVERSE_3
  42
  43 #define BINVERT_9 \
  44   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
  45
  46 #define BINVERT_15 \
  47   ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
  48
  49 /* For the various mpn_divexact_byN here, fall back to using either
  50    mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
  51    many faster if it is native.  For now, since mpn_divexact_1 is native on
  52    several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
  53    mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */
  54
  55 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
  56 #ifndef mpn_divexact_by3
  57 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
  58 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
  59 #else
  60 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
  61 #endif
  62 #endif
  63
  64 #ifndef mpn_divexact_by9
  65 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
  66 #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
  67 #else
  68 #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
  69 #endif
  70 #endif
  71
  72 #ifndef mpn_divexact_by15
  73 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
  74 #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
  75 #else
  76 #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
  77 #endif
  78 #endif
  79
  80 /* Interpolation for toom4, using the evaluation points 0, infinity,
  81    1, -1, 2, -2, 1/2. More precisely, we want to compute
  82    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
  83    seven values
  84
  85      w0 = f(0),
  86      w1 = f(-2),
  87      w2 = f(1),
  88      w3 = f(-1),
  89      w4 = f(2)
  90      w5 = 64 * f(1/2)
  91      w6 = limit at infinity of f(x) / x^6,
  92
  93    The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
  94    w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
  95    w6n }. The other values are 2n + 1 limbs each (with most
  96    significant limbs small). f(-1) and f(-1/2) may be negative, signs
  97    determined by the flag bits. Inputs are destroyed.
  98
  99    Needs (2*n + 1) limbs of temporary storage.
 100 */
 101
 102 void
 103 mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
 104                            mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
 105                            mp_size_t w6n, mp_ptr tp)
 106 {
 107   mp_size_t m;
 108   mp_limb_t cy;
 109
 110   m = 2*n + 1;
 111 #define w0 rp
 112 #define w2 (rp + 2*n)
 113 #define w6 (rp + 6*n)
 114
 115   ASSERT (w6n > 0);
 116   ASSERT (w6n <= 2*n);
 117
 118   /* Using formulas similar to Marco Bodrato's
 119
 120      W5 = W5 + W4
 121      W1 =(W4 - W1)/2
 122      W4 = W4 - W0
 123      W4 =(W4 - W1)/4 - W6*16
 124      W3 =(W2 - W3)/2
 125      W2 = W2 - W3
 126
 127      W5 = W5 - W2*65      May be negative.
 128      W2 = W2 - W6 - W0
 129      W5 =(W5 + W2*45)/2   Now >= 0 again.
 130      W4 =(W4 - W2)/3
 131      W2 = W2 - W4
 132
 133      W1 = W5 - W1         May be negative.
 134      W5 =(W5 - W3*8)/9
 135      W3 = W3 - W5
 136      W1 =(W1/15 + W5)/2   Now >= 0 again.
 137      W5 = W5 - W1
 138
 139      where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
 140            W4 = f(2), W5 = f(1/2), W6 = f(oo),
 141
 142      Note that most intermediate results are positive; the ones that
 143      may be negative are represented in two's complement. We must
 144      never shift right a value that may be negative, since that would
 145      invalidate the sign bit. On the other hand, divexact by odd
 146      numbers work fine with two's complement.
 147   */
 148
 149   mpn_add_n (w5, w5, w4, m);
 150   if (flags & toom7_w1_neg)
 151     {
 152 #ifdef HAVE_NATIVE_mpn_rsh1add_n
 153       mpn_rsh1add_n (w1, w1, w4, m);
 154 #else
 155       mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
 156       mpn_rshift (w1, w1, m, 1);
 157 #endif
 158     }
 159   else
 160     {
 161 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
 162       mpn_rsh1sub_n (w1, w4, w1, m);
 163 #else
 164       mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
 165       mpn_rshift (w1, w1, m, 1);
 166 #endif
 167     }
 168   mpn_sub (w4, w4, m, w0, 2*n);
 169   mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
 170   mpn_rshift (w4, w4, m, 2); /* w4>=0 */
 171
 172   tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
 173   mpn_sub (w4, w4, m, tp, w6n+1);
 174
 175   if (flags & toom7_w3_neg)
 176     {
 177 #ifdef HAVE_NATIVE_mpn_rsh1add_n
 178       mpn_rsh1add_n (w3, w3, w2, m);
 179 #else
 180       mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
 181       mpn_rshift (w3, w3, m, 1);
 182 #endif
 183     }
 184   else
 185     {
 186 #ifdef HAVE_NATIVE_mpn_rsh1sub_n
 187       mpn_rsh1sub_n (w3, w2, w3, m);
 188 #else
 189       mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
 190       mpn_rshift (w3, w3, m, 1);
 191 #endif
 192     }
 193
 194   mpn_sub_n (w2, w2, w3, m);
 195
 196   mpn_submul_1 (w5, w2, m, 65);
 197   mpn_sub (w2, w2, m, w6, w6n);
 198   mpn_sub (w2, w2, m, w0, 2*n);
 199
 200   mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
 201   mpn_rshift (w5, w5, m, 1);
 202   mpn_sub_n (w4, w4, w2, m);
 203
 204   mpn_divexact_by3 (w4, w4, m);
 205   mpn_sub_n (w2, w2, w4, m);
 206
 207   mpn_sub_n (w1, w5, w1, m);
 208   mpn_lshift (tp, w3, m, 3);
 209   mpn_sub_n (w5, w5, tp, m);
 210   mpn_divexact_by9 (w5, w5, m);
 211   mpn_sub_n (w3, w3, w5, m);
 212
 213   mpn_divexact_by15 (w1, w1, m);
 214   mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
 215   mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
 216   mpn_sub_n (w5, w5, w1, m);
 217
 218   /* These bounds are valid for the 4x4 polynomial product of toom44,
 219    * and they are conservative for toom53 and toom62. */
 220   ASSERT (w1[2*n] < 2);
 221   ASSERT (w2[2*n] < 3);
 222   ASSERT (w3[2*n] < 4);
 223   ASSERT (w4[2*n] < 3);
 224   ASSERT (w5[2*n] < 2);
 225
 226   /* Addition chain. Note carries and the 2n'th limbs that need to be
 227    * added in.
 228    *
 229    * Special care is needed for w2[2n] and the corresponding carry,
 230    * since the "simple" way of adding it all together would overwrite
 231    * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
 232    * the high half of w3 and the low half of w4.
 233    *
 234    *         7    6    5    4    3    2    1    0
 235    *    |    |    |    |    |    |    |    |    |
 236    *                  ||w3 (2n+1)|
 237    *             ||w4 (2n+1)|
 238    *        ||w5 (2n+1)|        ||w1 (2n+1)|
 239    *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
 240    *  -----------------------------------------------
 241    *  r |    |    |    |    |    |    |    |    |
 242    *        c7   c6   c5   c4   c3                 Carries to propagate
 243    */
 244
 245   cy = mpn_add_n (rp + n, rp + n, w1, m);
 246   MPN_INCR_U (w2 + n + 1, n , cy);
 247   cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
 248   MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
 249   cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
 250   MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
 251   cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
 252   MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
 253   if (w6n > n + 1)
 254     {
 255       cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
 256       MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
 257     }
 258   else
 259     {
 260       ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
 261 #if WANT_ASSERT
 262       {
 263         mp_size_t i;
 264         for (i = w6n; i <= n; i++)
 265           ASSERT (w5[n + i] == 0);
 266       }
 267 #endif
 268     }
 269 }