source/libs/gmp/gmp-src/mpn/generic/toom_interpolate_6pts.c

   1 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
   2
   3    Contributed to the GNU project by Marco Bodrato.
   4
   5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   8
   9 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of either:
  15
  16   * the GNU Lesser General Public License as published by the Free
  17     Software Foundation; either version 3 of the License, or (at your
  18     option) any later version.
  19
  20 or
  21
  22   * the GNU General Public License as published by the Free Software
  23     Foundation; either version 2 of the License, or (at your option) any
  24     later version.
  25
  26 or both in parallel, as here.
  27
  28 The GNU MP Library is distributed in the hope that it will be useful, but
  29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  31 for more details.
  32
  33 You should have received copies of the GNU General Public License and the
  34 GNU Lesser General Public License along with the GNU MP Library.  If not,
  35 see https://www.gnu.org/licenses/.  */
  36
  37 #include "gmp.h"
  38 #include "gmp-impl.h"
  39
  40 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
  41 #ifndef mpn_divexact_by3
  42 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && MODLIMB_INVERSE_3
  43 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,MODLIMB_INVERSE_3,0)
  44 #else
  45 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
  46 #endif
  47 #endif
  48
  49 /* Interpolation for Toom-3.5, using the evaluation points: infinity,
  50    1, -1, 2, -2. More precisely, we want to compute
  51    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
  52    six values
  53
  54      w5 = f(0),
  55      w4 = f(-1),
  56      w3 = f(1)
  57      w2 = f(-2),
  58      w1 = f(2),
  59      w0 = limit at infinity of f(x) / x^5,
  60
  61    The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
  62    {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
  63    {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
  64    significant limbs small). f(-1) and f(-2) may be negative, signs
  65    determined by the flag bits. All intermediate results are positive.
  66    Inputs are destroyed.
  67
  68    Interpolation sequence was taken from the paper: "Integer and
  69    Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
  70    Some slight variations were introduced: adaptation to "gmp
  71    instruction set", and a final saving of an operation by interlacing
  72    interpolation and recomposition phases.
  73 */
  74
  75 void
  76 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
  77                            mp_ptr w4, mp_ptr w2, mp_ptr w1,
  78                            mp_size_t w0n)
  79 {
  80   mp_limb_t cy;
  81   /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  82   mp_limb_t cy4, cy6, embankment;
  83
  84   ASSERT( n > 0 );
  85   ASSERT( 2*n >= w0n && w0n > 0 );
  86
  87 #define w5  pp                                  /* 2n   */
  88 #define w3  (pp + 2 * n)                        /* 2n+1 */
  89 #define w0  (pp + 5 * n)                        /* w0n  */
  90
  91   /* Interpolate with sequence:
  92      W2 =(W1 - W2)>>2
  93      W1 =(W1 - W5)>>1
  94      W1 =(W1 - W2)>>1
  95      W4 =(W3 - W4)>>1
  96      W2 =(W2 - W4)/3
  97      W3 = W3 - W4 - W5
  98      W1 =(W1 - W3)/3
  99      // Last steps are mixed with recomposition...
 100      W2 = W2 - W0<<2
 101      W4 = W4 - W2
 102      W3 = W3 - W1
 103      W2 = W2 - W0
 104   */
 105
 106   /* W2 =(W1 - W2)>>2 */
 107   if (flags & toom6_vm2_neg)
 108     mpn_add_n (w2, w1, w2, 2 * n + 1);
 109   else
 110     mpn_sub_n (w2, w1, w2, 2 * n + 1);
 111   mpn_rshift (w2, w2, 2 * n + 1, 2);
 112
 113   /* W1 =(W1 - W5)>>1 */
 114   w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
 115   mpn_rshift (w1, w1, 2 * n + 1, 1);
 116
 117   /* W1 =(W1 - W2)>>1 */
 118 #if HAVE_NATIVE_mpn_rsh1sub_n
 119   mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
 120 #else
 121   mpn_sub_n (w1, w1, w2, 2 * n + 1);
 122   mpn_rshift (w1, w1, 2 * n + 1, 1);
 123 #endif
 124
 125   /* W4 =(W3 - W4)>>1 */
 126   if (flags & toom6_vm1_neg)
 127     {
 128 #if HAVE_NATIVE_mpn_rsh1add_n
 129       mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
 130 #else
 131       mpn_add_n (w4, w3, w4, 2 * n + 1);
 132       mpn_rshift (w4, w4, 2 * n + 1, 1);
 133 #endif
 134     }
 135   else
 136     {
 137 #if HAVE_NATIVE_mpn_rsh1sub_n
 138       mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
 139 #else
 140       mpn_sub_n (w4, w3, w4, 2 * n + 1);
 141       mpn_rshift (w4, w4, 2 * n + 1, 1);
 142 #endif
 143     }
 144
 145   /* W2 =(W2 - W4)/3 */
 146   mpn_sub_n (w2, w2, w4, 2 * n + 1);
 147   mpn_divexact_by3 (w2, w2, 2 * n + 1);
 148
 149   /* W3 = W3 - W4 - W5 */
 150   mpn_sub_n (w3, w3, w4, 2 * n + 1);
 151   w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
 152
 153   /* W1 =(W1 - W3)/3 */
 154   mpn_sub_n (w1, w1, w3, 2 * n + 1);
 155   mpn_divexact_by3 (w1, w1, 2 * n + 1);
 156
 157   /*
 158     [1 0 0 0 0 0;
 159      0 1 0 0 0 0;
 160      1 0 1 0 0 0;
 161      0 1 0 1 0 0;
 162      1 0 1 0 1 0;
 163      0 0 0 0 0 1]
 164
 165     pp[] prior to operations:
 166      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
 167
 168     summation scheme for remaining operations:
 169      |______________5|n_____4|n_____3|n_____2|n______|n______|pp
 170      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
 171                                     || H w4  | L w4  |
 172                     || H w2  | L w2  |
 173             || H w1  | L w1  |
 174                             ||-H w1  |-L w1  |
 175                      |-H w0  |-L w0 ||-H w2  |-L w2  |
 176   */
 177   cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
 178   MPN_INCR_U (pp + 3 * n + 1, n, cy);
 179
 180   /* W2 -= W0<<2 */
 181 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
 182 #if HAVE_NATIVE_mpn_sublsh2_n_ip1
 183   cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
 184 #else
 185   cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
 186 #endif
 187 #else
 188   /* {W4,2*n+1} is now free and can be overwritten. */
 189   cy = mpn_lshift(w4, w0, w0n, 2);
 190   cy+= mpn_sub_n(w2, w2, w4, w0n);
 191 #endif
 192   MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
 193
 194   /* W4L = W4L - W2L */
 195   cy = mpn_sub_n (pp + n, pp + n, w2, n);
 196   MPN_DECR_U (w3, 2 * n + 1, cy);
 197
 198   /* W3H = W3H + W2L */
 199   cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
 200   /* W1L + W2H */
 201   cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
 202   MPN_INCR_U (w1 + n, n + 1, cy);
 203
 204   /* W0 = W0 + W1H */
 205   if (LIKELY (w0n > n))
 206     cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
 207   else
 208     cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
 209
 210   /*
 211     summation scheme for the next operation:
 212      |...____5|n_____4|n_____3|n_____2|n______|n______|pp
 213      |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
 214                      ...-w0___|-w1_w2 |
 215   */
 216   /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
 217   cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
 218
 219   /* embankment is a "dirty trick" to avoid carry/borrow propagation
 220      beyond allocated memory */
 221   embankment = w0[w0n - 1] - 1;
 222   w0[w0n - 1] = 1;
 223   if (LIKELY (w0n > n)) {
 224     if (cy4 > cy6)
 225       MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
 226     else
 227       MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
 228     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
 229     MPN_INCR_U (w0 + n, w0n - n, cy6);
 230   } else {
 231     MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
 232     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
 233   }
 234   w0[w0n - 1] += embankment;
 235
 236 #undef w5
 237 #undef w3
 238 #undef w0
 239
 240 }