source/libs/gmp/gmp-src/mpn/generic/mu_divappr_q.c

   1 /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
   2
   3    Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
   4    normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
   5    is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
   6    to let N be unmodified during the operation.
   7
   8    Contributed to the GNU project by Torbjorn Granlund.
   9
  10    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
  11    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
  12    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
  13
  14 Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
  15
  16 This file is part of the GNU MP Library.
  17
  18 The GNU MP Library is free software; you can redistribute it and/or modify
  19 it under the terms of either:
  20
  21   * the GNU Lesser General Public License as published by the Free
  22     Software Foundation; either version 3 of the License, or (at your
  23     option) any later version.
  24
  25 or
  26
  27   * the GNU General Public License as published by the Free Software
  28     Foundation; either version 2 of the License, or (at your option) any
  29     later version.
  30
  31 or both in parallel, as here.
  32
  33 The GNU MP Library is distributed in the hope that it will be useful, but
  34 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  35 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  36 for more details.
  37
  38 You should have received copies of the GNU General Public License and the
  39 GNU Lesser General Public License along with the GNU MP Library.  If not,
  40 see https://www.gnu.org/licenses/.  */
  41
  42
  43 /*
  44    The idea of the algorithm used herein is to compute a smaller inverted value
  45    than used in the standard Barrett algorithm, and thus save time in the
  46    Newton iterations, and pay just a small price when using the inverted value
  47    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  48 */
  49
  50 /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
  51
  52  Things to work on:
  53
  54   * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
  55     demonstrated by the fact that the mpn_invertappr function's scratch needs
  56     mean that we need to keep a large allocation long after it is needed.
  57     Things are worse as mpn_mul_fft does not accept any scratch parameter,
  58     which means we'll have a large memory hole while in mpn_mul_fft.  In
  59     general, a peak scratch need in the beginning of a function isn't
  60     well-handled by the itch/scratch scheme.
  61 */
  62
  63 #ifdef STAT
  64 #undef STAT
  65 #define STAT(x) x
  66 #else
  67 #define STAT(x)
  68 #endif
  69
  70 #include <stdlib.h>             /* for NULL */
  71 #include "gmp.h"
  72 #include "gmp-impl.h"
  73
  74
  75 mp_limb_t
  76 mpn_mu_divappr_q (mp_ptr qp,
  77                   mp_srcptr np,
  78                   mp_size_t nn,
  79                   mp_srcptr dp,
  80                   mp_size_t dn,
  81                   mp_ptr scratch)
  82 {
  83   mp_size_t qn, in;
  84   mp_limb_t cy, qh;
  85   mp_ptr ip, tp;
  86
  87   ASSERT (dn > 1);
  88
  89   qn = nn - dn;
  90
  91   /* If Q is smaller than D, truncate operands. */
  92   if (qn + 1 < dn)
  93     {
  94       np += dn - (qn + 1);
  95       nn -= dn - (qn + 1);
  96       dp += dn - (qn + 1);
  97       dn = qn + 1;
  98     }
  99
 100   /* Compute the inverse size.  */
 101   in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
 102   ASSERT (in <= dn);
 103
 104 #if 1
 105   /* This alternative inverse computation method gets slightly more accurate
 106      results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
 107      not adapted (3) mpn_invertappr scratch needs not met.  */
 108   ip = scratch;
 109   tp = scratch + in + 1;
 110
 111   /* compute an approximate inverse on (in+1) limbs */
 112   if (dn == in)
 113     {
 114       MPN_COPY (tp + 1, dp, in);
 115       tp[0] = 1;
 116       mpn_invertappr (ip, tp, in + 1, tp + in + 1);
 117       MPN_COPY_INCR (ip, ip + 1, in);
 118     }
 119   else
 120     {
 121       cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
 122       if (UNLIKELY (cy != 0))
 123         MPN_ZERO (ip, in);
 124       else
 125         {
 126           mpn_invertappr (ip, tp, in + 1, tp + in + 1);
 127           MPN_COPY_INCR (ip, ip + 1, in);
 128         }
 129     }
 130 #else
 131   /* This older inverse computation method gets slightly worse results than the
 132      one above.  */
 133   ip = scratch;
 134   tp = scratch + in;
 135
 136   /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
 137      inversion function should do this automatically.  */
 138   if (dn == in)
 139     {
 140       tp[in + 1] = 0;
 141       MPN_COPY (tp + in + 2, dp, in);
 142       mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
 143     }
 144   else
 145     {
 146       mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
 147     }
 148   cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
 149   if (UNLIKELY (cy != 0))
 150     MPN_ZERO (tp + 1, in);
 151   MPN_COPY (ip, tp + 1, in);
 152 #endif
 153
 154   qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
 155
 156   return qh;
 157 }
 158
 159 mp_limb_t
 160 mpn_preinv_mu_divappr_q (mp_ptr qp,
 161                          mp_srcptr np,
 162                          mp_size_t nn,
 163                          mp_srcptr dp,
 164                          mp_size_t dn,
 165                          mp_srcptr ip,
 166                          mp_size_t in,
 167                          mp_ptr scratch)
 168 {
 169   mp_size_t qn;
 170   mp_limb_t cy, cx, qh;
 171   mp_limb_t r;
 172   mp_size_t tn, wn;
 173
 174 #define rp           scratch
 175 #define tp           (scratch + dn)
 176 #define scratch_out  (scratch + dn + tn)
 177
 178   qn = nn - dn;
 179
 180   np += qn;
 181   qp += qn;
 182
 183   qh = mpn_cmp (np, dp, dn) >= 0;
 184   if (qh != 0)
 185     mpn_sub_n (rp, np, dp, dn);
 186   else
 187     MPN_COPY (rp, np, dn);
 188
 189   if (qn == 0)
 190     return qh;                  /* Degenerate use.  Should we allow this? */
 191
 192   while (qn > 0)
 193     {
 194       if (qn < in)
 195         {
 196           ip += in - qn;
 197           in = qn;
 198         }
 199       np -= in;
 200       qp -= in;
 201
 202       /* Compute the next block of quotient limbs by multiplying the inverse I
 203          by the upper part of the partial remainder R.  */
 204       mpn_mul_n (tp, rp + dn - in, ip, in);             /* mulhi  */
 205       cy = mpn_add_n (qp, tp + in, rp + dn - in, in);   /* I's msb implicit */
 206       ASSERT_ALWAYS (cy == 0);
 207
 208       qn -= in;
 209       if (qn == 0)
 210         break;
 211
 212       /* Compute the product of the quotient block and the divisor D, to be
 213          subtracted from the partial remainder combined with new limbs from the
 214          dividend N.  We only really need the low dn limbs.  */
 215
 216       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 217         mpn_mul (tp, dp, dn, qp, in);           /* dn+in limbs, high 'in' cancels */
 218       else
 219         {
 220           tn = mpn_mulmod_bnm1_next_size (dn + 1);
 221           mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 222           wn = dn + in - tn;                    /* number of wrapped limbs */
 223           if (wn > 0)
 224             {
 225               cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
 226               cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
 227               cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
 228               ASSERT_ALWAYS (cx >= cy);
 229               mpn_incr_u (tp, cx - cy);
 230             }
 231         }
 232
 233       r = rp[dn - in] - tp[dn];
 234
 235       /* Subtract the product from the partial remainder combined with new
 236          limbs from the dividend N, generating a new partial remainder R.  */
 237       if (dn != in)
 238         {
 239           cy = mpn_sub_n (tp, np, tp, in);      /* get next 'in' limbs from N */
 240           cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
 241           MPN_COPY (rp, tp, dn);                /* FIXME: try to avoid this */
 242         }
 243       else
 244         {
 245           cy = mpn_sub_n (rp, np, tp, in);      /* get next 'in' limbs from N */
 246         }
 247
 248       STAT (int i; int err = 0;
 249             static int errarr[5]; static int err_rec; static int tot);
 250
 251       /* Check the remainder R and adjust the quotient as needed.  */
 252       r -= cy;
 253       while (r != 0)
 254         {
 255           /* We loop 0 times with about 69% probability, 1 time with about 31%
 256              probability, 2 times with about 0.6% probability, if inverse is
 257              computed as recommended.  */
 258           mpn_incr_u (qp, 1);
 259           cy = mpn_sub_n (rp, rp, dp, dn);
 260           r -= cy;
 261           STAT (err++);
 262         }
 263       if (mpn_cmp (rp, dp, dn) >= 0)
 264         {
 265           /* This is executed with about 76% probability.  */
 266           mpn_incr_u (qp, 1);
 267           cy = mpn_sub_n (rp, rp, dp, dn);
 268           STAT (err++);
 269         }
 270
 271       STAT (
 272             tot++;
 273             errarr[err]++;
 274             if (err > err_rec)
 275               err_rec = err;
 276             if (tot % 0x10000 == 0)
 277               {
 278                 for (i = 0; i <= err_rec; i++)
 279                   printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
 280                 printf ("\n");
 281               }
 282             );
 283     }
 284
 285   /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
 286      quotient.  For now, just make sure the returned quotient is >= the real
 287      quotient; add 3 with saturating arithmetic.  */
 288   qn = nn - dn;
 289   cy += mpn_add_1 (qp, qp, qn, 3);
 290   if (cy != 0)
 291     {
 292       if (qh != 0)
 293         {
 294           /* Return a quotient of just 1-bits, with qh set.  */
 295           mp_size_t i;
 296           for (i = 0; i < qn; i++)
 297             qp[i] = GMP_NUMB_MAX;
 298         }
 299       else
 300         {
 301           /* Propagate carry into qh.  */
 302           qh = 1;
 303         }
 304     }
 305
 306   return qh;
 307 }
 308
 309 /* In case k=0 (automatic choice), we distinguish 3 cases:
 310    (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
 311    (b) dn/3 < qn <= dn: in = ceil(qn / 2)
 312    (c) qn < dn/3:       in = qn
 313    In all cases we have in <= dn.
 314  */
 315 mp_size_t
 316 mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
 317 {
 318   mp_size_t in;
 319
 320   if (k == 0)
 321     {
 322       mp_size_t b;
 323       if (qn > dn)
 324         {
 325           /* Compute an inverse size that is a nice partition of the quotient.  */
 326           b = (qn - 1) / dn + 1;        /* ceil(qn/dn), number of blocks */
 327           in = (qn - 1) / b + 1;        /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 328         }
 329       else if (3 * qn > dn)
 330         {
 331           in = (qn - 1) / 2 + 1;        /* b = 2 */
 332         }
 333       else
 334         {
 335           in = (qn - 1) / 1 + 1;        /* b = 1 */
 336         }
 337     }
 338   else
 339     {
 340       mp_size_t xn;
 341       xn = MIN (dn, qn);
 342       in = (xn - 1) / k + 1;
 343     }
 344
 345   return in;
 346 }
 347
 348 mp_size_t
 349 mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
 350 {
 351   mp_size_t qn, in, itch_local, itch_out, itch_invapp;
 352
 353   qn = nn - dn;
 354   if (qn + 1 < dn)
 355     {
 356       dn = qn + 1;
 357     }
 358   in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
 359
 360   itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
 361   itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
 362   itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
 363
 364   ASSERT (dn + itch_local + itch_out >= itch_invapp);
 365   return in + MAX (dn + itch_local + itch_out, itch_invapp);
 366 }