source/libs/gmp/gmp-src/mpn/generic/mu_bdiv_q.c

   1 /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn.
   2    storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   3    overlap disallowed.
   4
   5    Contributed to the GNU project by Torbjorn Granlund.
   6
   7    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   8    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   9    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
  10
  11 Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
  12
  13 This file is part of the GNU MP Library.
  14
  15 The GNU MP Library is free software; you can redistribute it and/or modify
  16 it under the terms of either:
  17
  18   * the GNU Lesser General Public License as published by the Free
  19     Software Foundation; either version 3 of the License, or (at your
  20     option) any later version.
  21
  22 or
  23
  24   * the GNU General Public License as published by the Free Software
  25     Foundation; either version 2 of the License, or (at your option) any
  26     later version.
  27
  28 or both in parallel, as here.
  29
  30 The GNU MP Library is distributed in the hope that it will be useful, but
  31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  32 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  33 for more details.
  34
  35 You should have received copies of the GNU General Public License and the
  36 GNU Lesser General Public License along with the GNU MP Library.  If not,
  37 see https://www.gnu.org/licenses/.  */
  38
  39
  40 /*
  41    The idea of the algorithm used herein is to compute a smaller inverted value
  42    than used in the standard Barrett algorithm, and thus save time in the
  43    Newton iterations, and pay just a small price when using the inverted value
  44    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  45 */
  46
  47 #include "gmp.h"
  48 #include "gmp-impl.h"
  49
  50
  51 /* N = {np,nn}
  52    D = {dp,dn}
  53
  54    Requirements: N >= D
  55                  D >= 1
  56                  D odd
  57                  dn >= 2
  58                  nn >= 2
  59                  scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
  60
  61    Write quotient to Q = {qp,nn}.
  62
  63    FIXME: When iterating, perhaps do the small step before loop, not after.
  64    FIXME: Try to avoid the scalar divisions when computing inverse size.
  65    FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
  66           particular, when dn==in, tp and rp could use the same space.
  67    FIXME: Trim final quotient calculation to qn limbs of precision.
  68 */
  69 void
  70 mpn_mu_bdiv_q (mp_ptr qp,
  71                mp_srcptr np, mp_size_t nn,
  72                mp_srcptr dp, mp_size_t dn,
  73                mp_ptr scratch)
  74 {
  75   mp_size_t qn;
  76   mp_size_t in;
  77   int cy, c0;
  78   mp_size_t tn, wn;
  79
  80   qn = nn;
  81
  82   ASSERT (dn >= 2);
  83   ASSERT (qn >= 2);
  84
  85   if (qn > dn)
  86     {
  87       mp_size_t b;
  88
  89       /* |_______________________|   dividend
  90                         |________|   divisor  */
  91
  92 #define ip           scratch                    /* in */
  93 #define rp           (scratch + in)             /* dn or rest >= binvert_itch(in) */
  94 #define tp           (scratch + in + dn)        /* dn+in or next_size(dn) */
  95 #define scratch_out  (scratch + in + dn + tn)   /* mulmod_bnm1_itch(next_size(dn)) */
  96
  97       /* Compute an inverse size that is a nice partition of the quotient.  */
  98       b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
  99       in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 100
 101       /* Some notes on allocation:
 102
 103          When in = dn, R dies when mpn_mullo returns, if in < dn the low in
 104          limbs of R dies at that point.  We could save memory by letting T live
 105          just under R, and let the upper part of T expand into R. These changes
 106          should reduce itch to perhaps 3dn.
 107        */
 108
 109       mpn_binvert (ip, dp, in, rp);
 110
 111       cy = 0;
 112
 113       MPN_COPY (rp, np, dn);
 114       np += dn;
 115       mpn_mullo_n (qp, rp, ip, in);
 116       qn -= in;
 117
 118       while (qn > in)
 119         {
 120           if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 121             mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
 122           else
 123             {
 124               tn = mpn_mulmod_bnm1_next_size (dn);
 125               mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 126               wn = dn + in - tn;                /* number of wrapped limbs */
 127               if (wn > 0)
 128                 {
 129                   c0 = mpn_sub_n (tp + tn, tp, rp, wn);
 130                   mpn_decr_u (tp + wn, c0);
 131                 }
 132             }
 133
 134           qp += in;
 135           if (dn != in)
 136             {
 137               /* Subtract tp[dn-1...in] from partial remainder.  */
 138               cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
 139               if (cy == 2)
 140                 {
 141                   mpn_incr_u (tp + dn, 1);
 142                   cy = 1;
 143                 }
 144             }
 145           /* Subtract tp[dn+in-1...dn] from dividend.  */
 146           cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
 147           np += in;
 148           mpn_mullo_n (qp, rp, ip, in);
 149           qn -= in;
 150         }
 151
 152       /* Generate last qn limbs.
 153          FIXME: It should be possible to limit precision here, since qn is
 154          typically somewhat smaller than dn.  No big gains expected.  */
 155
 156       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 157         mpn_mul (tp, dp, dn, qp, in);           /* mulhi, need tp[qn+in-1...in] */
 158       else
 159         {
 160           tn = mpn_mulmod_bnm1_next_size (dn);
 161           mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 162           wn = dn + in - tn;                    /* number of wrapped limbs */
 163           if (wn > 0)
 164             {
 165               c0 = mpn_sub_n (tp + tn, tp, rp, wn);
 166               mpn_decr_u (tp + wn, c0);
 167             }
 168         }
 169
 170       qp += in;
 171       if (dn != in)
 172         {
 173           cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
 174           if (cy == 2)
 175             {
 176               mpn_incr_u (tp + dn, 1);
 177               cy = 1;
 178             }
 179         }
 180
 181       mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
 182       mpn_mullo_n (qp, rp, ip, qn);
 183
 184 #undef ip
 185 #undef rp
 186 #undef tp
 187 #undef scratch_out
 188    }
 189   else
 190     {
 191       /* |_______________________|   dividend
 192                 |________________|   divisor  */
 193
 194 #define ip           scratch            /* in */
 195 #define tp           (scratch + in)     /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
 196 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */
 197
 198       /* Compute half-sized inverse.  */
 199       in = qn - (qn >> 1);
 200
 201       mpn_binvert (ip, dp, in, tp);
 202
 203       mpn_mullo_n (qp, np, ip, in);             /* low `in' quotient limbs */
 204
 205       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 206         mpn_mul (tp, dp, qn, qp, in);           /* mulhigh */
 207       else
 208         {
 209           tn = mpn_mulmod_bnm1_next_size (qn);
 210           mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
 211           wn = qn + in - tn;                    /* number of wrapped limbs */
 212           if (wn > 0)
 213             {
 214               c0 = mpn_cmp (tp, np, wn) < 0;
 215               mpn_decr_u (tp + wn, c0);
 216             }
 217         }
 218
 219       mpn_sub_n (tp, np + in, tp + in, qn - in);
 220       mpn_mullo_n (qp + in, tp, ip, qn - in);   /* high qn-in quotient limbs */
 221
 222 #undef ip
 223 #undef tp
 224 #undef scratch_out
 225     }
 226 }
 227
 228 mp_size_t
 229 mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
 230 {
 231   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
 232   mp_size_t b;
 233
 234   ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
 235
 236   qn = nn;
 237
 238   if (qn > dn)
 239     {
 240       b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
 241       in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 242       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 243         {
 244           tn = dn + in;
 245           itch_out = 0;
 246         }
 247       else
 248         {
 249           tn = mpn_mulmod_bnm1_next_size (dn);
 250           itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
 251         }
 252       itches = dn + tn + itch_out;
 253     }
 254   else
 255     {
 256       in = qn - (qn >> 1);
 257       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 258         {
 259           tn = qn + in;
 260           itch_out = 0;
 261         }
 262       else
 263         {
 264           tn = mpn_mulmod_bnm1_next_size (qn);
 265           itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
 266         }
 267       itches = tn + itch_out;
 268     }
 269
 270   itch_binvert = mpn_binvert_itch (in);
 271   return in + MAX (itches, itch_binvert);
 272 }
 273