source/libs/gmp/gmp-src/mpn/generic/mu_div_qr.c

   1 /* mpn_mu_div_qr, mpn_preinv_mu_div_qr.
   2
   3    Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
   4    must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   5    nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
   6    let N be unmodified during the operation.
   7
   8    Contributed to the GNU project by Torbjorn Granlund.
   9
  10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
  11    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
  12    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
  13
  14 Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
  15
  16 This file is part of the GNU MP Library.
  17
  18 The GNU MP Library is free software; you can redistribute it and/or modify
  19 it under the terms of either:
  20
  21   * the GNU Lesser General Public License as published by the Free
  22     Software Foundation; either version 3 of the License, or (at your
  23     option) any later version.
  24
  25 or
  26
  27   * the GNU General Public License as published by the Free Software
  28     Foundation; either version 2 of the License, or (at your option) any
  29     later version.
  30
  31 or both in parallel, as here.
  32
  33 The GNU MP Library is distributed in the hope that it will be useful, but
  34 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  35 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  36 for more details.
  37
  38 You should have received copies of the GNU General Public License and the
  39 GNU Lesser General Public License along with the GNU MP Library.  If not,
  40 see https://www.gnu.org/licenses/.  */
  41
  42
  43 /*
  44    The idea of the algorithm used herein is to compute a smaller inverted value
  45    than used in the standard Barrett algorithm, and thus save time in the
  46    Newton iterations, and pay just a small price when using the inverted value
  47    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  48 */
  49
  50 /* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.
  51
  52  Things to work on:
  53
  54   * This isn't optimal when the quotient isn't needed, as it might take a lot
  55     of space.  The computation is always needed, though, so there is no time to
  56     save with special code.
  57
  58   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
  59     demonstrated by the fact that the mpn_invertappr function's scratch needs
  60     mean that we need to keep a large allocation long after it is needed.
  61     Things are worse as mpn_mul_fft does not accept any scratch parameter,
  62     which means we'll have a large memory hole while in mpn_mul_fft.  In
  63     general, a peak scratch need in the beginning of a function isn't
  64     well-handled by the itch/scratch scheme.
  65 */
  66
  67 #ifdef STAT
  68 #undef STAT
  69 #define STAT(x) x
  70 #else
  71 #define STAT(x)
  72 #endif
  73
  74 #include <stdlib.h>             /* for NULL */
  75 #include "gmp.h"
  76 #include "gmp-impl.h"
  77
  78
  79 /* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
  80    speedup according to old measurements, but does the decision mechanism
  81    really make sense?  It seem like the quotient between dn and qn might be
  82    what we really should be checking.  */
  83 #ifndef MU_DIV_QR_SKEW_THRESHOLD
  84 #define MU_DIV_QR_SKEW_THRESHOLD 100
  85 #endif
  86
  87 #ifdef CHECK                            /* FIXME: Enable in minithres */
  88 #undef  MU_DIV_QR_SKEW_THRESHOLD
  89 #define MU_DIV_QR_SKEW_THRESHOLD 1
  90 #endif
  91
  92
  93 static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
  94
  95
  96 mp_limb_t
  97 mpn_mu_div_qr (mp_ptr qp,
  98                mp_ptr rp,
  99                mp_srcptr np,
 100                mp_size_t nn,
 101                mp_srcptr dp,
 102                mp_size_t dn,
 103                mp_ptr scratch)
 104 {
 105   mp_size_t qn;
 106   mp_limb_t cy, qh;
 107
 108   qn = nn - dn;
 109   if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
 110     {
 111       /* |______________|_ign_first__|   dividend                         nn
 112                 |_______|_ign_first__|   divisor                          dn
 113
 114                 |______|             quotient (prel)                      qn
 115
 116                  |___________________|   quotient * ignored-divisor-part  dn-1
 117       */
 118
 119       /* Compute a preliminary quotient and a partial remainder by dividing the
 120          most significant limbs of each operand.  */
 121       qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
 122                            np + nn - (2 * qn + 1), 2 * qn + 1,
 123                            dp + dn - (qn + 1), qn + 1,
 124                            scratch);
 125
 126       /* Multiply the quotient by the divisor limbs ignored above.  */
 127       if (dn - (qn + 1) > qn)
 128         mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
 129       else
 130         mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */
 131
 132       if (qh)
 133         cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
 134       else
 135         cy = 0;
 136       scratch[dn - 1] = cy;
 137
 138       cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
 139       cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
 140                        rp + nn - (2 * qn + 1),
 141                        scratch + nn - (2 * qn + 1),
 142                        qn + 1, cy);
 143       if (cy)
 144         {
 145           qh -= mpn_sub_1 (qp, qp, qn, 1);
 146           mpn_add_n (rp, rp, dp, dn);
 147         }
 148     }
 149   else
 150     {
 151       qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
 152     }
 153
 154   return qh;
 155 }
 156
 157 static mp_limb_t
 158 mpn_mu_div_qr2 (mp_ptr qp,
 159                 mp_ptr rp,
 160                 mp_srcptr np,
 161                 mp_size_t nn,
 162                 mp_srcptr dp,
 163                 mp_size_t dn,
 164                 mp_ptr scratch)
 165 {
 166   mp_size_t qn, in;
 167   mp_limb_t cy, qh;
 168   mp_ptr ip, tp;
 169
 170   ASSERT (dn > 1);
 171
 172   qn = nn - dn;
 173
 174   /* Compute the inverse size.  */
 175   in = mpn_mu_div_qr_choose_in (qn, dn, 0);
 176   ASSERT (in <= dn);
 177
 178 #if 1
 179   /* This alternative inverse computation method gets slightly more accurate
 180      results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
 181      not adapted (3) mpn_invertappr scratch needs not met.  */
 182   ip = scratch;
 183   tp = scratch + in + 1;
 184
 185   /* compute an approximate inverse on (in+1) limbs */
 186   if (dn == in)
 187     {
 188       MPN_COPY (tp + 1, dp, in);
 189       tp[0] = 1;
 190       mpn_invertappr (ip, tp, in + 1, tp + in + 1);
 191       MPN_COPY_INCR (ip, ip + 1, in);
 192     }
 193   else
 194     {
 195       cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
 196       if (UNLIKELY (cy != 0))
 197         MPN_ZERO (ip, in);
 198       else
 199         {
 200           mpn_invertappr (ip, tp, in + 1, tp + in + 1);
 201           MPN_COPY_INCR (ip, ip + 1, in);
 202         }
 203     }
 204 #else
 205   /* This older inverse computation method gets slightly worse results than the
 206      one above.  */
 207   ip = scratch;
 208   tp = scratch + in;
 209
 210   /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
 211      inversion function should do this automatically.  */
 212   if (dn == in)
 213     {
 214       tp[in + 1] = 0;
 215       MPN_COPY (tp + in + 2, dp, in);
 216       mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
 217     }
 218   else
 219     {
 220       mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
 221     }
 222   cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
 223   if (UNLIKELY (cy != 0))
 224     MPN_ZERO (tp + 1, in);
 225   MPN_COPY (ip, tp + 1, in);
 226 #endif
 227
 228   qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
 229
 230   return qh;
 231 }
 232
 233 mp_limb_t
 234 mpn_preinv_mu_div_qr (mp_ptr qp,
 235                       mp_ptr rp,
 236                       mp_srcptr np,
 237                       mp_size_t nn,
 238                       mp_srcptr dp,
 239                       mp_size_t dn,
 240                       mp_srcptr ip,
 241                       mp_size_t in,
 242                       mp_ptr scratch)
 243 {
 244   mp_size_t qn;
 245   mp_limb_t cy, cx, qh;
 246   mp_limb_t r;
 247   mp_size_t tn, wn;
 248
 249 #define tp           scratch
 250 #define scratch_out  (scratch + tn)
 251
 252   qn = nn - dn;
 253
 254   np += qn;
 255   qp += qn;
 256
 257   qh = mpn_cmp (np, dp, dn) >= 0;
 258   if (qh != 0)
 259     mpn_sub_n (rp, np, dp, dn);
 260   else
 261     MPN_COPY_INCR (rp, np, dn);
 262
 263   /* if (qn == 0) */                    /* The while below handles this case */
 264   /*   return qh; */                    /* Degenerate use.  Should we allow this? */
 265
 266   while (qn > 0)
 267     {
 268       if (qn < in)
 269         {
 270           ip += in - qn;
 271           in = qn;
 272         }
 273       np -= in;
 274       qp -= in;
 275
 276       /* Compute the next block of quotient limbs by multiplying the inverse I
 277          by the upper part of the partial remainder R.  */
 278       mpn_mul_n (tp, rp + dn - in, ip, in);             /* mulhi  */
 279       cy = mpn_add_n (qp, tp + in, rp + dn - in, in);   /* I's msb implicit */
 280       ASSERT_ALWAYS (cy == 0);
 281
 282       qn -= in;
 283
 284       /* Compute the product of the quotient block and the divisor D, to be
 285          subtracted from the partial remainder combined with new limbs from the
 286          dividend N.  We only really need the low dn+1 limbs.  */
 287
 288       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 289         mpn_mul (tp, dp, dn, qp, in);           /* dn+in limbs, high 'in' cancels */
 290       else
 291         {
 292           tn = mpn_mulmod_bnm1_next_size (dn + 1);
 293           mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 294           wn = dn + in - tn;                    /* number of wrapped limbs */
 295           if (wn > 0)
 296             {
 297               cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
 298               cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
 299               cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
 300               ASSERT_ALWAYS (cx >= cy);
 301               mpn_incr_u (tp, cx - cy);
 302             }
 303         }
 304
 305       r = rp[dn - in] - tp[dn];
 306
 307       /* Subtract the product from the partial remainder combined with new
 308          limbs from the dividend N, generating a new partial remainder R.  */
 309       if (dn != in)
 310         {
 311           cy = mpn_sub_n (tp, np, tp, in);      /* get next 'in' limbs from N */
 312           cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
 313           MPN_COPY (rp, tp, dn);                /* FIXME: try to avoid this */
 314         }
 315       else
 316         {
 317           cy = mpn_sub_n (rp, np, tp, in);      /* get next 'in' limbs from N */
 318         }
 319
 320       STAT (int i; int err = 0;
 321             static int errarr[5]; static int err_rec; static int tot);
 322
 323       /* Check the remainder R and adjust the quotient as needed.  */
 324       r -= cy;
 325       while (r != 0)
 326         {
 327           /* We loop 0 times with about 69% probability, 1 time with about 31%
 328              probability, 2 times with about 0.6% probability, if inverse is
 329              computed as recommended.  */
 330           mpn_incr_u (qp, 1);
 331           cy = mpn_sub_n (rp, rp, dp, dn);
 332           r -= cy;
 333           STAT (err++);
 334         }
 335       if (mpn_cmp (rp, dp, dn) >= 0)
 336         {
 337           /* This is executed with about 76% probability.  */
 338           mpn_incr_u (qp, 1);
 339           cy = mpn_sub_n (rp, rp, dp, dn);
 340           STAT (err++);
 341         }
 342
 343       STAT (
 344             tot++;
 345             errarr[err]++;
 346             if (err > err_rec)
 347               err_rec = err;
 348             if (tot % 0x10000 == 0)
 349               {
 350                 for (i = 0; i <= err_rec; i++)
 351                   printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
 352                 printf ("\n");
 353               }
 354             );
 355     }
 356
 357   return qh;
 358 }
 359
 360 /* In case k=0 (automatic choice), we distinguish 3 cases:
 361    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
 362    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
 363    (c) qn < dn/3:       in = qn
 364    In all cases we have in <= dn.
 365  */
 366 mp_size_t
 367 mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
 368 {
 369   mp_size_t in;
 370
 371   if (k == 0)
 372     {
 373       mp_size_t b;
 374       if (qn > dn)
 375         {
 376           /* Compute an inverse size that is a nice partition of the quotient.  */
 377           b = (qn - 1) / dn + 1;        /* ceil(qn/dn), number of blocks */
 378           in = (qn - 1) / b + 1;        /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 379         }
 380       else if (3 * qn > dn)
 381         {
 382           in = (qn - 1) / 2 + 1;        /* b = 2 */
 383         }
 384       else
 385         {
 386           in = (qn - 1) / 1 + 1;        /* b = 1 */
 387         }
 388     }
 389   else
 390     {
 391       mp_size_t xn;
 392       xn = MIN (dn, qn);
 393       in = (xn - 1) / k + 1;
 394     }
 395
 396   return in;
 397 }
 398
 399 mp_size_t
 400 mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
 401 {
 402   mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
 403   mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in);
 404   mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
 405
 406   ASSERT (itch_preinv >= itch_invapp);
 407   return in + MAX (itch_invapp, itch_preinv);
 408 }
 409
 410 mp_size_t
 411 mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
 412 {
 413   mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
 414   mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
 415
 416   return itch_local + itch_out;
 417 }