/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
/* See "Karatsuba Square Root", reference in gmp.texi.  */
#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#define USE_DIVAPPR_Q 1
#define TRACE(x)
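
/* Note: TRACE(x) expands to nothing; to enable the debugging printf calls
   scattered through this file, define it as x instead.  Those printf calls
   are the only users of <stdio.h> here.  */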
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
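
/* Illustrative note (not from the GMP sources): each entry above appears
   to be round(0x100 * sqrt(0x200 / (i + 0.5))) - 0x100, where i in
   [0x80, 0x200) is the 9-bit value looked up below as abits.  E.g. for
   i = 0x100: round(256 * sqrt(512/256.5)) - 256 = 362 - 256 = 0x6a,
   matching the first sqrt(1/100) entry.  */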
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
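
  /* Added note: the Newton step for f(x) = 1/x^2 - a is
       x_{k+1} = x_k + x_k * (1 - a*x_k^2) / 2,
     which needs only multiplications and shifts.  The fixed-point code
     below evaluates this with the error term (1 - a*x_k^2) held in a
     signed quantity t; the bias constants (0x30000, MAGIC) appear to keep
     the estimate on one side of the exact value.  The last step multiplies
     by a0, converting the 1/sqrt(a0) approximation into a sqrt(a0)
     approximation, which the final branch then corrects by at most 1.  */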
  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
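
/* Worked example (illustration only): with 64-bit limbs and a0 = 2^63,
   mpn_sqrtrem1 returns floor(sqrt(2^63)) = 3037000499 and stores
   *rp = 2^63 - 3037000499^2 = 5928526807.  */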
#define Prec (GMP_NUMB_BITS >> 1)
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
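
/* Added sketch of the recursion, following the "Karatsuba Square Root"
   paper cited above: write the normalized operand as
   N = a3*b^3 + a2*b^2 + a1*b + a0 with a3 >= b/4.  Then
     (s1, r1) = SqrtRem (a3*b + a2)       [recursive call on the high half]
     (q, u)   = DivRem (r1*b + a1, 2*s1)  [the mpn_tdiv_qr call below]
     s        = s1*b + q
     r        = u*b + a0 - q^2            [adjusted in the c < 0 branch
                                           when it comes out negative]  */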
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c, b;         /* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
      if (q != 0)
        ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
      mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
      q += scratch[l];
      c = scratch[0] & 1;
      mpn_rshift (sp, scratch, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
        return 1;   /* Remainder is non-zero */
      q >>= 1;
      if (c != 0)
        c = mpn_add_n (np + l, np + l, sp + l, h);
      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

      if (c < 0)
        {
          q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
          c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
          c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
          c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
          q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
        }
    }

  return c;
}
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c;            /* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
    }
  if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
       (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
    {
      mp_limb_t cy;
      /* Approximation is not good enough, the extra limb(+ nsh bits)
	 is smaller than needed to absorb the possible error. */
      /* {qp + 1, l + 1} equals 2*{sp, l} */
      /* FIXME: use mullo or wrap-around, or directly evaluate
	 remainder with a single sqrmod_bnm1. */
      TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
      ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
      /* Compute the remainder of the previous mpn_div(appr)_q. */
      cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
      MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
      ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
      if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	{
	  /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	  cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	  cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	  ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	  MPN_DECR_U (sp, l, 1);
	}
      /* Can the root be exact when a correction was needed?  We
	 did not find an example, but it depends on divappr
	 internals, and we can not assume it true in general... */
      /* else */
#else /* WANT_ASSERT */
      ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
      if (mpn_zero_p (tp + l + 1, h - l))
	{
	  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	  mpn_sqr (scratch, sp, l);
	  c = mpn_cmp (tp + 1, scratch + l, l);
	  if (c == 0)
	    {
	      if (nsh != 0)
		{
		  mpn_lshift (tp, np, l, 2 * nsh);
		  np = tp;
		}
	      c = mpn_cmp (np, scratch + odd, l - odd);
	    }
	  if (c < 0)
	    {
	      MPN_DECR_U (sp, l, 1);
	      c = 1;
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t mask;
      mp_ptr scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;	/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
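
#if 0
/* Usage sketch, not part of GMP: compute floor(sqrt(N)) and the remainder
   for the single-limb operand N = 1000000000 (assumes limbs of at least
   32 bits).  Since 31622^2 = 999950884 and 31623^2 > 10^9, this prints
   sqrt = 31622, rem = 49116, rn = 1 (rn is nonzero because N is not a
   perfect square).  */
int
main (void)
{
  mp_limb_t np[1] = { CNST_LIMB (1000000000) };
  mp_limb_t sp[1], rp[1];
  mp_size_t rn = mpn_sqrtrem (sp, rp, np, 1);
  printf ("sqrt = %lu, rem = %lu, rn = %ld\n",
	  (unsigned long) sp[0], rn != 0 ? (unsigned long) rp[0] : 0UL,
	  (long) rn);
  return 0;
}
#endif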