source/libs/gmp/gmp-src/mpn/generic/mullo_n.c

   1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
   2    of their product.
   3
   4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
   5
   6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
   7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
   8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   9
  10 Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of either:
  16
  17   * the GNU Lesser General Public License as published by the Free
  18     Software Foundation; either version 3 of the License, or (at your
  19     option) any later version.
  20
  21 or
  22
  23   * the GNU General Public License as published by the Free Software
  24     Foundation; either version 2 of the License, or (at your option) any
  25     later version.
  26
  27 or both in parallel, as here.
  28
  29 The GNU MP Library is distributed in the hope that it will be useful, but
  30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  32 for more details.
  33
  34 You should have received copies of the GNU General Public License and the
  35 GNU Lesser General Public License along with the GNU MP Library.  If not,
  36 see https://www.gnu.org/licenses/.  */
  37
  38 #include "gmp.h"
  39 #include "gmp-impl.h"
  40
  41
  42 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
  43 #define MAYBE_range_basecase 1
  44 #define MAYBE_range_toom22   1
  45 #else
  46 #define MAYBE_range_basecase                                           \
  47   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
  48 #define MAYBE_range_toom22                                             \
  49   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
  50 #endif
  51
  52 /*  THINK: The DC strategy uses different constants in different Toom's
  53          ranges. Something smoother?
  54 */
  55
  56 /*
  57   Compute the least significant half of the product {xp,n}*{yp,n}, or
  58   formally {rp,n} = {xp,n}*{yp,n} Mod (B^n).
  59
  60   Above the given threshold, the Divide and Conquer strategy is used.
  61   The operands are split in two, and a full product plus two mullo
  62   are used to obtain the final result. The more natural strategy is to
  63   split in two halves, but this is far from optimal when a
  64   sub-quadratic multiplication is used.
  65
  66   Mulders suggests an unbalanced split in favour of the full product,
  67   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
  68
  69   To compute the value of a, we assume that the cost of mullo for a
  70   given size ML(n) is a fraction of the cost of a full product with
  71   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
  72   then we can write:
  73
  74   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
  75
  76   Given a value for e, want to minimise the value of k, i.e. the
  77   function k=(1-a)^e/(1-2*a^e).
  78
  79   With e=2, the exponent for schoolbook multiplication, the minimum is
  80   given by the values a=1-a=1/2.
  81
  82   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
  83   Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
  84
  85   Other possible approximations follow:
  86   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
  87   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
  88   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
  89   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
  90
  91   The values above were obtained with the following trivial commands
  92   in the gp-pari shell:
  93
  94 fun(e,a)=(1-a)^e/(1-2*a^e)
  95 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
  96 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
  97 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
  98 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
  99 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
 100 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
 101
 102   ,
 103   |\
 104   | \
 105   +----,
 106   |    |
 107   |    |
 108   |    |\
 109   |    | \
 110   +----+--`
 111   ^ n2 ^n1^
 112
 113   For an actual implementation, the assumption that M(n)=n^e is
 114   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
 115   with a constant k is wrong.
 116
 117   But the theory suggests two things:
 118   - the better the multiplication algorithm is (lower e), the closer k
 119     gets to 1, and the closer a gets to 0.
 120
 121   - A value for a smaller than optimal is probably less bad than a
 122     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
 123     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
 124     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
 125 */
 126
 127 static mp_size_t
 128 mpn_mullo_n_itch (mp_size_t n)
 129 {
 130   return 2*n;
 131 }
 132
 133 /*
 134     mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
 135     It accepts tp == rp.
 136 */
 137 static void
 138 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
 139 {
 140   mp_size_t n2, n1;
 141   ASSERT (n >= 2);
 142   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
 143   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
 144   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
 145
 146   /* Divide-and-conquer */
 147
 148   /* We need fractional approximation of the value 0 < a <= 1/2
 149      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
 150   */
 151   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
 152     n1 = n >> 1;
 153   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
 154     n1 = n * 11 / (size_t) 36;  /* n1 ~= n*(1-.694...) */
 155   else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
 156     n1 = n * 9 / (size_t) 40;   /* n1 ~= n*(1-.775...) */
 157   else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
 158     n1 = n * 7 / (size_t) 39;   /* n1 ~= n*(1-.821...) */
 159   /* n1 = n * 4 / (size_t) 31;  // n1 ~= n*(1-.871...) [TOOM66] */
 160   else
 161     n1 = n / (size_t) 10;               /* n1 ~= n*(1-.899...) [TOOM88] */
 162
 163   n2 = n - n1;
 164
 165   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
 166               y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
 167
 168   /* x0 * y0 */
 169   mpn_mul_n (tp, xp, yp, n2);
 170   MPN_COPY (rp, tp, n2);
 171
 172   /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
 173   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
 174     mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
 175   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
 176     mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
 177   else
 178     mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
 179   mpn_add_n (rp + n2, tp + n2, tp + n, n1);
 180
 181   /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
 182   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
 183     mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
 184   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
 185     mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
 186   else
 187     mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
 188   mpn_add_n (rp + n2, rp + n2, tp + n, n1);
 189 }
 190
 191 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
 192 #define MUL_BASECASE_ALLOC \
 193  (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
 194
 195 /* FIXME: This function should accept a temporary area; mpn_dc_mullo_n
 196    accepts a pointer tp, and handles the case tp == rp; do the same here.
 197    Maybe recombine the two functions.
 198    THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
 199           (typically thanks to mpn_addmul_2) should we unconditionally use
 200           mpn_mul_n?
 201 */
 202
 203 void
 204 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
 205 {
 206   ASSERT (n >= 1);
 207   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
 208   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
 209
 210   if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
 211     {
 212       /* Allocate workspace of fixed size on stack: fast! */
 213       mp_limb_t tp[MUL_BASECASE_ALLOC];
 214       mpn_mul_basecase (tp, xp, n, yp, n);
 215       MPN_COPY (rp, tp, n);
 216     }
 217   else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
 218     {
 219       mpn_mullo_basecase (rp, xp, yp, n);
 220     }
 221   else
 222     {
 223       mp_ptr tp;
 224       TMP_DECL;
 225       TMP_MARK;
 226       tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
 227       if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
 228         {
 229           mpn_dc_mullo_n (rp, xp, yp, n, tp);
 230         }
 231       else
 232         {
 233           /* For really large operands, use plain mpn_mul_n but throw away upper n
 234              limbs of result.  */
 235 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
 236           mpn_fft_mul (tp, xp, n, yp, n);
 237 #else
 238           mpn_mul_n (tp, xp, yp, n);
 239 #endif
 240           MPN_COPY (rp, tp, n);
 241         }
 242       TMP_FREE;
 243     }
 244 }