source/libs/gmp/gmp-src/mpn/generic/mul.c

   1 /* mpn_mul -- Multiply two natural numbers.
   2
   3    Contributed to the GNU project by Torbjorn Granlund.
   4
   5 Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
   6 2014 Free Software Foundation, Inc.
   7
   8 This file is part of the GNU MP Library.
   9
  10 The GNU MP Library is free software; you can redistribute it and/or modify
  11 it under the terms of either:
  12
  13   * the GNU Lesser General Public License as published by the Free
  14     Software Foundation; either version 3 of the License, or (at your
  15     option) any later version.
  16
  17 or
  18
  19   * the GNU General Public License as published by the Free Software
  20     Foundation; either version 2 of the License, or (at your option) any
  21     later version.
  22
  23 or both in parallel, as here.
  24
  25 The GNU MP Library is distributed in the hope that it will be useful, but
  26 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  27 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  28 for more details.
  29
  30 You should have received copies of the GNU General Public License and the
  31 GNU Lesser General Public License along with the GNU MP Library.  If not,
  32 see https://www.gnu.org/licenses/.  */
  33
  34 #include "gmp.h"
  35 #include "gmp-impl.h"
  36
  37
  38 #ifndef MUL_BASECASE_MAX_UN
  39 #define MUL_BASECASE_MAX_UN 500
  40 #endif
  41
  42 /* Areas where the different toom algorithms can be called (extracted
  43    from the t-toom*.c files, and ignoring small constant offsets):
  44
  45    1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
  46                                         4/7              6/7
  47                                        6/11
  48                                        |--------------------| toom22 (small)
  49                                                            || toom22 (large)
  50                                                        |xxxx| toom22 called
  51                       |-------------------------------------| toom32
  52                                          |xxxxxxxxxxxxxxxx| | toom32 called
  53                                                |------------| toom33
  54                                                           |x| toom33 called
  55              |---------------------------------|            | toom42
  56                       |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
  57                                        |--------------------| toom43
  58                                                |xxxxxxxxxx|   toom43 called
  59          |-----------------------------|                      toom52 (unused)
  60                                                    |--------| toom44
  61                                                    |xxxxxxxx| toom44 called
  62                               |--------------------|        | toom53
  63                                         |xxxxxx|              toom53 called
  64     |-------------------------|                               toom62 (unused)
  65                                            |----------------| toom54 (unused)
  66                       |--------------------|                  toom63
  67                               |xxxxxxxxx|                   | toom63 called
  68                           |---------------------------------| toom6h
  69                                                    |xxxxxxxx| toom6h called
  70                                   |-------------------------| toom8h (32 bit)
  71                  |------------------------------------------| toom8h (64 bit)
  72                                                    |xxxxxxxx| toom8h called
  73 */
  74
  75 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
  76 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
  77
  78 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
  79    (pointed to by VP, with VN limbs), and store the result at PRODP.  The
  80    result is UN + VN limbs.  Return the most significant limb of the result.
  81
  82    NOTE: The space pointed to by PRODP is overwritten before finished with U
  83    and V, so overlap is an error.
  84
  85    Argument constraints:
  86    1. UN >= VN.
  87    2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
  88       the multiplier and the multiplicand.  */
  89
  90 /*
  91   * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
  92     ideal lines of the surrounding algorithms.  Is that optimal?
  93
  94   * The toomX3 code now uses a structure similar to the one of toomX2, except
  95     that it loops longer in the unbalanced case.  The result is that the
  96     remaining area might have un < vn.  Should we fix the toomX2 code in a
  97     similar way?
  98
  99   * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
 100     therefore calls mpn_mul recursively for certain cases.
 101
 102   * Allocate static temp space using THRESHOLD variables (except for toom44
 103     when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
 104
 105   * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
 106     have the same vn threshold.  This is not true, we should actually use
 107     mul_basecase for slightly larger operands for toom32 than for toom22, and
 108     even larger for toom42.
 109
 110   * That problem is even more prevalent for toomX3.  We therefore use special
 111     THRESHOLD variables there.
 112
 113   * Is our ITCH allocation correct?
 114 */
 115
 116 #define ITCH (16*vn + 100)
 117
 118 mp_limb_t
 119 mpn_mul (mp_ptr prodp,
 120          mp_srcptr up, mp_size_t un,
 121          mp_srcptr vp, mp_size_t vn)
 122 {
 123   ASSERT (un >= vn);
 124   ASSERT (vn >= 1);
 125   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
 126   ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
 127
 128   if (un == vn)
 129     {
 130       if (up == vp)
 131         mpn_sqr (prodp, up, un);
 132       else
 133         mpn_mul_n (prodp, up, vp, un);
 134     }
 135   else if (vn < MUL_TOOM22_THRESHOLD)
 136     { /* plain schoolbook multiplication */
 137
 138       /* Unless un is very large, or else if have an applicable mpn_mul_N,
 139          perform basecase multiply directly.  */
 140       if (un <= MUL_BASECASE_MAX_UN
 141 #if HAVE_NATIVE_mpn_mul_2
 142           || vn <= 2
 143 #else
 144           || vn == 1
 145 #endif
 146           )
 147         mpn_mul_basecase (prodp, up, un, vp, vn);
 148       else
 149         {
 150           /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
 151              locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
 152              these pieces with the vp[] operand.  After each such partial
 153              multiplication (but the last) we copy the most significant vn
 154              limbs into a temporary buffer since that part would otherwise be
 155              overwritten by the next multiplication.  After the next
 156              multiplication, we add it back.  This illustrates the situation:
 157
 158                                                     -->vn<--
 159                                                       |  |<------- un ------->|
 160                                                          _____________________|
 161                                                         X                    /|
 162                                                       /XX__________________/  |
 163                                     _____________________                     |
 164                                    X                    /                     |
 165                                  /XX__________________/                       |
 166                _____________________                                          |
 167               /                    /                                          |
 168             /____________________/                                            |
 169             ==================================================================
 170
 171             The parts marked with X are the parts whose sums are copied into
 172             the temporary buffer.  */
 173
 174           mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
 175           mp_limb_t cy;
 176           ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
 177
 178           mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
 179           prodp += MUL_BASECASE_MAX_UN;
 180           MPN_COPY (tp, prodp, vn);             /* preserve high triangle */
 181           up += MUL_BASECASE_MAX_UN;
 182           un -= MUL_BASECASE_MAX_UN;
 183           while (un > MUL_BASECASE_MAX_UN)
 184             {
 185               mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
 186               cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
 187               mpn_incr_u (prodp + vn, cy);
 188               prodp += MUL_BASECASE_MAX_UN;
 189               MPN_COPY (tp, prodp, vn);         /* preserve high triangle */
 190               up += MUL_BASECASE_MAX_UN;
 191               un -= MUL_BASECASE_MAX_UN;
 192             }
 193           if (un > vn)
 194             {
 195               mpn_mul_basecase (prodp, up, un, vp, vn);
 196             }
 197           else
 198             {
 199               ASSERT (un > 0);
 200               mpn_mul_basecase (prodp, vp, vn, up, un);
 201             }
 202           cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
 203           mpn_incr_u (prodp + vn, cy);
 204         }
 205     }
 206   else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
 207     {
 208       /* Use ToomX2 variants */
 209       mp_ptr scratch;
 210       TMP_SDECL; TMP_SMARK;
 211
 212 #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
 213       scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
 214       ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
 215       ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
 216       ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
 217 #undef ITCH_TOOMX2
 218
 219       /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
 220          square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
 221          wise; we would get better balance by slightly moving the bound.  We
 222          will sometimes end up with un < vn, like in the X3 arm below.  */
 223       if (un >= 3 * vn)
 224         {
 225           mp_limb_t cy;
 226           mp_ptr ws;
 227
 228           /* The maximum ws usage is for the mpn_mul result.  */
 229           ws = TMP_SALLOC_LIMBS (4 * vn);
 230
 231           mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
 232           un -= 2 * vn;
 233           up += 2 * vn;
 234           prodp += 2 * vn;
 235
 236           while (un >= 3 * vn)
 237             {
 238               mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
 239               un -= 2 * vn;
 240               up += 2 * vn;
 241               cy = mpn_add_n (prodp, prodp, ws, vn);
 242               MPN_COPY (prodp + vn, ws + vn, 2 * vn);
 243               mpn_incr_u (prodp + vn, cy);
 244               prodp += 2 * vn;
 245             }
 246
 247           /* vn <= un < 3vn */
 248
 249           if (4 * un < 5 * vn)
 250             mpn_toom22_mul (ws, up, un, vp, vn, scratch);
 251           else if (4 * un < 7 * vn)
 252             mpn_toom32_mul (ws, up, un, vp, vn, scratch);
 253           else
 254             mpn_toom42_mul (ws, up, un, vp, vn, scratch);
 255
 256           cy = mpn_add_n (prodp, prodp, ws, vn);
 257           MPN_COPY (prodp + vn, ws + vn, un);
 258           mpn_incr_u (prodp + vn, cy);
 259         }
 260       else
 261         {
 262           if (4 * un < 5 * vn)
 263             mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
 264           else if (4 * un < 7 * vn)
 265             mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 266           else
 267             mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 268         }
 269       TMP_SFREE;
 270     }
 271   else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
 272            BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
 273     {
 274       /* Handle the largest operands that are not in the FFT range.  The 2nd
 275          condition makes very unbalanced operands avoid the FFT code (except
 276          perhaps as coefficient products of the Toom code.  */
 277
 278       if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
 279         {
 280           /* Use ToomX3 variants */
 281           mp_ptr scratch;
 282           TMP_DECL; TMP_MARK;
 283
 284 #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
 285           scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
 286           ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
 287           ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
 288           ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
 289           ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
 290           ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
 291           ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
 292 #undef ITCH_TOOMX3
 293
 294           if (2 * un >= 5 * vn)
 295             {
 296               mp_limb_t cy;
 297               mp_ptr ws;
 298
 299               /* The maximum ws usage is for the mpn_mul result.  */
 300               ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
 301
 302               if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 303                 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
 304               else
 305                 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
 306               un -= 2 * vn;
 307               up += 2 * vn;
 308               prodp += 2 * vn;
 309
 310               while (2 * un >= 5 * vn)  /* un >= 2.5vn */
 311                 {
 312                   if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 313                     mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
 314                   else
 315                     mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
 316                   un -= 2 * vn;
 317                   up += 2 * vn;
 318                   cy = mpn_add_n (prodp, prodp, ws, vn);
 319                   MPN_COPY (prodp + vn, ws + vn, 2 * vn);
 320                   mpn_incr_u (prodp + vn, cy);
 321                   prodp += 2 * vn;
 322                 }
 323
 324               /* vn / 2 <= un < 2.5vn */
 325
 326               if (un < vn)
 327                 mpn_mul (ws, vp, vn, up, un);
 328               else
 329                 mpn_mul (ws, up, un, vp, vn);
 330
 331               cy = mpn_add_n (prodp, prodp, ws, vn);
 332               MPN_COPY (prodp + vn, ws + vn, un);
 333               mpn_incr_u (prodp + vn, cy);
 334             }
 335           else
 336             {
 337               if (6 * un < 7 * vn)
 338                 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
 339               else if (2 * un < 3 * vn)
 340                 {
 341                   if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
 342                     mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 343                   else
 344                     mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
 345                 }
 346               else if (6 * un < 11 * vn)
 347                 {
 348                   if (4 * un < 7 * vn)
 349                     {
 350                       if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
 351                         mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
 352                       else
 353                         mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
 354                     }
 355                   else
 356                     {
 357                       if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
 358                         mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 359                       else
 360                         mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
 361                     }
 362                 }
 363               else
 364                 {
 365                   if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
 366                     mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
 367                   else
 368                     mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
 369                 }
 370             }
 371           TMP_FREE;
 372         }
 373       else
 374         {
 375           mp_ptr scratch;
 376           TMP_DECL; TMP_MARK;
 377
 378           if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
 379             {
 380               scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
 381               mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
 382             }
 383           else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
 384             {
 385               scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
 386               mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
 387             }
 388           else
 389             {
 390               scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
 391               mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
 392             }
 393           TMP_FREE;
 394         }
 395     }
 396   else
 397     {
 398       if (un >= 8 * vn)
 399         {
 400           mp_limb_t cy;
 401           mp_ptr ws;
 402           TMP_DECL; TMP_MARK;
 403
 404           /* The maximum ws usage is for the mpn_mul result.  */
 405           ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
 406
 407           mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
 408           un -= 3 * vn;
 409           up += 3 * vn;
 410           prodp += 3 * vn;
 411
 412           while (2 * un >= 7 * vn)      /* un >= 3.5vn  */
 413             {
 414               mpn_fft_mul (ws, up, 3 * vn, vp, vn);
 415               un -= 3 * vn;
 416               up += 3 * vn;
 417               cy = mpn_add_n (prodp, prodp, ws, vn);
 418               MPN_COPY (prodp + vn, ws + vn, 3 * vn);
 419               mpn_incr_u (prodp + vn, cy);
 420               prodp += 3 * vn;
 421             }
 422
 423           /* vn / 2 <= un < 3.5vn */
 424
 425           if (un < vn)
 426             mpn_mul (ws, vp, vn, up, un);
 427           else
 428             mpn_mul (ws, up, un, vp, vn);
 429
 430           cy = mpn_add_n (prodp, prodp, ws, vn);
 431           MPN_COPY (prodp + vn, ws + vn, un);
 432           mpn_incr_u (prodp + vn, cy);
 433
 434           TMP_FREE;
 435         }
 436       else
 437         mpn_fft_mul (prodp, up, un, vp, vn);
 438     }
 439
 440   return prodp[un + vn - 1];    /* historic */
 441 }