beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / mode1o.c
blobec91da223d4880577b8b1e5dc75651ad0ffadd53
1 /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST
4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5 FUTURE GNU MP RELEASES.
7 Copyright 2000-2004 Free Software Foundation, Inc.
9 This file is part of the GNU MP Library.
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
14 * the GNU Lesser General Public License as published by the Free
15 Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
20 * the GNU General Public License as published by the Free Software
21 Foundation; either version 2 of the License, or (at your option) any
22 later version.
24 or both in parallel, as here.
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 for more details.
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library. If not,
33 see https://www.gnu.org/licenses/. */
35 #include "gmp.h"
36 #include "gmp-impl.h"
37 #include "longlong.h"
40 /* Calculate an r satisfying
42 r*B^k + a - c == q*d
44 where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
45 (the caller won't know which), and q is the quotient (discarded). d must
46 be odd, c can be any limb value.
48 If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
50 This slightly strange function suits the initial Nx1 reduction for GCDs
51 or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
52 -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be
53 ignored, or for the Jacobi symbol it can be accounted for. The function
54 also suits divisibility and congruence testing since if r=0 (or r=d) is
55 obtained then a==c mod d.
58 r is a bit like the remainder returned by mpn_divexact_by3c, and is the
59 sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r
60 represents a borrow, since effectively quotient limbs are chosen so that
61 subtracting that multiple of d from src at each step will produce a zero
62 limb.
64 A long calculation can be done piece by piece from low to high by passing
65 the return value from one part as the carry parameter to the next part.
66 The effective final k becomes anything between size and size-n, if n
67 pieces are used.
70 A similar sort of routine could be constructed based on adding multiples
71 of d at each limb, much like redc in mpz_powm does. Subtracting however
72 has a small advantage that when subtracting to cancel out l there's never
73 a borrow into h, whereas using an addition would put a carry into h
74 depending whether l==0 or l!=0.
77 In terms of efficiency, this function is similar to a mul-by-inverse
78 mpn_mod_1. Both are essentially two multiplies and are best suited to
79 CPUs with low latency multipliers (in comparison to a divide instruction
80 at least.) But modexact has a few less supplementary operations, only
81 needs low part and high part multiplies, and has fewer working quantities
82 (helping CPUs with few registers).
85 In the main loop it will be noted that the new carry (call it r) is the
86 sum of the high product h and any borrow from l=s-c. If c<d then we will
87 have r<d too, for the following reasons. Let q=l*inverse be the quotient
88 limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then
90 l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
92 But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
93 never have h=d-1 and so r=h+borrow <= d-1.
95 When c>=d, on the other hand, h=d-1 can certainly occur together with a
96 borrow, thereby giving only r<=d, as per the function definition above.
98 As a design decision it's left to the caller to check for r=d if it might
99 be passing c>=d. Several applications have c<d initially so the extra
100 test is often unnecessary, for example the GCDs or a plain divisibility
101 d|a test will pass c=0.
104 The special case for size==1 is so that it can be assumed c<=d in the
105 high<=divisor test at the end. c<=d is only guaranteed after at least
106 one iteration of the main loop. There's also a decent chance one % is
107 faster than a binvert_limb, though that will depend on the processor.
109 A CPU specific implementation might want to omit the size==1 code or the
110 high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither
111 useful. */
114 mp_limb_t
115 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
116 mp_limb_t orig_c)
118 mp_limb_t s, h, l, inverse, dummy, dmul, ret;
119 mp_limb_t c = orig_c;
120 mp_size_t i;
122 ASSERT (size >= 1);
123 ASSERT (d & 1);
124 ASSERT_MPN (src, size);
125 ASSERT_LIMB (d);
126 ASSERT_LIMB (c);
128 if (size == 1)
130 s = src[0];
131 if (s > c)
133 l = s-c;
134 h = l % d;
135 if (h != 0)
136 h = d - h;
138 else
140 l = c-s;
141 h = l % d;
143 return h;
147 binvert_limb (inverse, d);
148 dmul = d << GMP_NAIL_BITS;
150 i = 0;
153 s = src[i];
154 SUBC_LIMB (c, l, s, c);
155 l = (l * inverse) & GMP_NUMB_MASK;
156 umul_ppmm (h, dummy, l, dmul);
157 c += h;
159 while (++i < size-1);
162 s = src[i];
163 if (s <= d)
165 /* With high<=d the final step can be a subtract and addback. If c==0
166 then the addback will restore to l>=0. If c==d then will get l==d
167 if s==0, but that's ok per the function definition. */
169 l = c - s;
170 if (c < s)
171 l += d;
173 ret = l;
175 else
177 /* Can't skip a divide, just do the loop code once more. */
179 SUBC_LIMB (c, l, s, c);
180 l = (l * inverse) & GMP_NUMB_MASK;
181 umul_ppmm (h, dummy, l, dmul);
182 c += h;
183 ret = c;
186 ASSERT (orig_c < d ? ret < d : ret <= d);
187 return ret;
192 #if 0
194 /* The following is an alternate form that might shave one cycle on a
195 superscalar processor since it takes c+=h off the dependent chain,
196 leaving just a low product, high product, and a subtract.
198 This is for CPU specific implementations to consider. A special case for
199 high<divisor and/or size==1 can be added if desired.
201 Notice that c is only ever 0 or 1, since if s-c produces a borrow then
202 x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become
203 c=(x==0xFF..FF) too, if that helped. */
205 mp_limb_t
206 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
208 mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2;
209 mp_limb_t c = 0;
210 mp_size_t i;
212 ASSERT (size >= 1);
213 ASSERT (d & 1);
215 binvert_limb (inverse, d);
216 dmul = d << GMP_NAIL_BITS;
218 for (i = 0; i < size; i++)
220 ASSERT (c==0 || c==1);
222 s = src[i];
223 SUBC_LIMB (c1, x, s, c);
225 SUBC_LIMB (c2, y, x, h);
226 c = c1 + c2;
228 y = (y * inverse) & GMP_NUMB_MASK;
229 umul_ppmm (h, dummy, y, dmul);
232 h += c;
233 return h;
236 #endif