source/libs/gmp/gmp-src/mpn/generic/mode1o.c

   1 /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
   2
   3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   5    FUTURE GNU MP RELEASES.
   6
   7 Copyright 2000-2004 Free Software Foundation, Inc.
   8
   9 This file is part of the GNU MP Library.
  10
  11 The GNU MP Library is free software; you can redistribute it and/or modify
  12 it under the terms of either:
  13
  14   * the GNU Lesser General Public License as published by the Free
  15     Software Foundation; either version 3 of the License, or (at your
  16     option) any later version.
  17
  18 or
  19
  20   * the GNU General Public License as published by the Free Software
  21     Foundation; either version 2 of the License, or (at your option) any
  22     later version.
  23
  24 or both in parallel, as here.
  25
  26 The GNU MP Library is distributed in the hope that it will be useful, but
  27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 for more details.
  30
  31 You should have received copies of the GNU General Public License and the
  32 GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 see https://www.gnu.org/licenses/.  */
  34
  35 #include "gmp.h"
  36 #include "gmp-impl.h"
  37 #include "longlong.h"
  38
  39
  40 /* Calculate an r satisfying
  41
  42            r*B^k + a - c == q*d
  43
  44    where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
  45    (the caller won't know which), and q is the quotient (discarded).  d must
  46    be odd, c can be any limb value.
  47
  48    If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
  49
  50    This slightly strange function suits the initial Nx1 reduction for GCDs
  51    or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
  52    -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
  53    ignored, or for the Jacobi symbol it can be accounted for.  The function
  54    also suits divisibility and congruence testing since if r=0 (or r=d) is
  55    obtained then a==c mod d.
  56
  57
  58    r is a bit like the remainder returned by mpn_divexact_by3c, and is the
  59    sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
  60    represents a borrow, since effectively quotient limbs are chosen so that
  61    subtracting that multiple of d from src at each step will produce a zero
  62    limb.
  63
  64    A long calculation can be done piece by piece from low to high by passing
  65    the return value from one part as the carry parameter to the next part.
  66    The effective final k becomes anything between size and size-n, if n
  67    pieces are used.
  68
  69
  70    A similar sort of routine could be constructed based on adding multiples
  71    of d at each limb, much like redc in mpz_powm does.  Subtracting however
  72    has a small advantage that when subtracting to cancel out l there's never
  73    a borrow into h, whereas using an addition would put a carry into h
  74    depending whether l==0 or l!=0.
  75
  76
  77    In terms of efficiency, this function is similar to a mul-by-inverse
  78    mpn_mod_1.  Both are essentially two multiplies and are best suited to
  79    CPUs with low latency multipliers (in comparison to a divide instruction
  80    at least.)  But modexact has a few less supplementary operations, only
  81    needs low part and high part multiplies, and has fewer working quantities
  82    (helping CPUs with few registers).
  83
  84
  85    In the main loop it will be noted that the new carry (call it r) is the
  86    sum of the high product h and any borrow from l=s-c.  If c<d then we will
  87    have r<d too, for the following reasons.  Let q=l*inverse be the quotient
  88    limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then
  89
  90        l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
  91
  92    But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
  93    never have h=d-1 and so r=h+borrow <= d-1.
  94
  95    When c>=d, on the other hand, h=d-1 can certainly occur together with a
  96    borrow, thereby giving only r<=d, as per the function definition above.
  97
  98    As a design decision it's left to the caller to check for r=d if it might
  99    be passing c>=d.  Several applications have c<d initially so the extra
 100    test is often unnecessary, for example the GCDs or a plain divisibility
 101    d|a test will pass c=0.
 102
 103
 104    The special case for size==1 is so that it can be assumed c<=d in the
 105    high<=divisor test at the end.  c<=d is only guaranteed after at least
 106    one iteration of the main loop.  There's also a decent chance one % is
 107    faster than a binvert_limb, though that will depend on the processor.
 108
 109    A CPU specific implementation might want to omit the size==1 code or the
 110    high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
 111    useful.  */
 112
 113
 114 mp_limb_t
 115 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
 116                      mp_limb_t orig_c)
 117 {
 118   mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
 119   mp_limb_t  c = orig_c;
 120   mp_size_t  i;
 121
 122   ASSERT (size >= 1);
 123   ASSERT (d & 1);
 124   ASSERT_MPN (src, size);
 125   ASSERT_LIMB (d);
 126   ASSERT_LIMB (c);
 127
 128   if (size == 1)
 129     {
 130       s = src[0];
 131       if (s > c)
 132         {
 133           l = s-c;
 134           h = l % d;
 135           if (h != 0)
 136             h = d - h;
 137         }
 138       else
 139         {
 140           l = c-s;
 141           h = l % d;
 142         }
 143       return h;
 144     }
 145
 146
 147   binvert_limb (inverse, d);
 148   dmul = d << GMP_NAIL_BITS;
 149
 150   i = 0;
 151   do
 152     {
 153       s = src[i];
 154       SUBC_LIMB (c, l, s, c);
 155       l = (l * inverse) & GMP_NUMB_MASK;
 156       umul_ppmm (h, dummy, l, dmul);
 157       c += h;
 158     }
 159   while (++i < size-1);
 160
 161
 162   s = src[i];
 163   if (s <= d)
 164     {
 165       /* With high<=d the final step can be a subtract and addback.  If c==0
 166          then the addback will restore to l>=0.  If c==d then will get l==d
 167          if s==0, but that's ok per the function definition.  */
 168
 169       l = c - s;
 170       if (c < s)
 171         l += d;
 172
 173       ret = l;
 174     }
 175   else
 176     {
 177       /* Can't skip a divide, just do the loop code once more. */
 178
 179       SUBC_LIMB (c, l, s, c);
 180       l = (l * inverse) & GMP_NUMB_MASK;
 181       umul_ppmm (h, dummy, l, dmul);
 182       c += h;
 183       ret = c;
 184     }
 185
 186   ASSERT (orig_c < d ? ret < d : ret <= d);
 187   return ret;
 188 }
 189
 190
 191
 192 #if 0
 193
 194 /* The following is an alternate form that might shave one cycle on a
 195    superscalar processor since it takes c+=h off the dependent chain,
 196    leaving just a low product, high product, and a subtract.
 197
 198    This is for CPU specific implementations to consider.  A special case for
 199    high<divisor and/or size==1 can be added if desired.
 200
 201    Notice that c is only ever 0 or 1, since if s-c produces a borrow then
 202    x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
 203    c=(x==0xFF..FF) too, if that helped.  */
 204
 205 mp_limb_t
 206 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
 207 {
 208   mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
 209   mp_limb_t  c = 0;
 210   mp_size_t  i;
 211
 212   ASSERT (size >= 1);
 213   ASSERT (d & 1);
 214
 215   binvert_limb (inverse, d);
 216   dmul = d << GMP_NAIL_BITS;
 217
 218   for (i = 0; i < size; i++)
 219     {
 220       ASSERT (c==0 || c==1);
 221
 222       s = src[i];
 223       SUBC_LIMB (c1, x, s, c);
 224
 225       SUBC_LIMB (c2, y, x, h);
 226       c = c1 + c2;
 227
 228       y = (y * inverse) & GMP_NUMB_MASK;
 229       umul_ppmm (h, dummy, y, dmul);
 230     }
 231
 232   h += c;
 233   return h;
 234 }
 235
 236 #endif