/* mpn_sqrtrem -- square root and remainder

   Contributed to the GNU project by Paul Zimmermann (most code),
   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).

   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
   MUTABLE INTERFACE.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
   DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
/* See "Karatsuba Square Root", reference in gmp.texi.  */
#include <stdio.h>
#include <stdlib.h>

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#define USE_DIVAPPR_Q 1
#define TRACE(x)
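
/* Note: TRACE(x) expands to nothing; to enable the debugging printf calls
   scattered through this file, define it as x instead.  Those printf calls
   are the only users of <stdio.h> here.  */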
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
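
/* Illustrative note (not from the GMP sources): each entry above appears
   to be round(0x100 * sqrt(0x200 / (i + 0.5))) - 0x100, where i in
   [0x80, 0x200) is the 9-bit value looked up below as abits.  E.g. for
   i = 0x100: round(256 * sqrt(512/256.5)) - 256 = 362 - 256 = 0x6a,
   matching the first sqrt(1/100) entry.  */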
/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */

#if GMP_NUMB_BITS > 32
#define MAGIC CNST_LIMB(0x10000000000)  /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
#else
#define MAGIC CNST_LIMB(0x100000)       /* 0xfee6f < MAGIC < 0x29cbc8 */
#endif
static mp_limb_t
mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
{
#if GMP_NUMB_BITS > 32
  mp_limb_t a1;
#endif
  mp_limb_t x0, t2, t, x2;
  unsigned abits;

  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);

  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     since we can do the former without division.  As part of the last
     iteration convert from 1/sqrt(a) to sqrt(a).  */
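
  /* Added note: the Newton step for f(x) = 1/x^2 - a is
       x_{k+1} = x_k + x_k * (1 - a*x_k^2) / 2,
     which needs only multiplications and shifts.  The fixed-point code
     below evaluates this with the error term (1 - a*x_k^2) held in a
     signed quantity t; the bias constants (0x30000, MAGIC) appear to keep
     the estimate on one side of the exact value.  The last step multiplies
     by a0, converting the 1/sqrt(a0) approximation into a sqrt(a0)
     approximation, which the final branch then corrects by at most 1.  */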
  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);  /* extract bits for table lookup */
  x0 = 0x100 | invsqrttab[abits - 0x80];  /* initial 1/sqrt(a) */

  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */

#if GMP_NUMB_BITS > 32
  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));

  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */

  t2 = x0 * (a0 >> (32-8));
  t = t2 >> 25;
  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
  x0 >>= 32;
#else
  t2 = x0 * (a0 >> (16-8));
  t = t2 >> 13;
  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
  x0 >>= 16;
#endif

  /* x0 is now a full limb approximation of sqrt(a0) */

  x2 = x0 * x0;
  if (x2 + 2*x0 <= a0 - 1)
    {
      x2 += 2*x0 + 1;
      x0++;
    }

  *rp = a0 - x2;
  return x0;
}
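
/* Worked example (illustration only): with 64-bit limbs and a0 = 2^63,
   mpn_sqrtrem1 returns floor(sqrt(2^63)) = 3037000499 and stores
   *rp = 2^63 - 3037000499^2 = 5928526807.  */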
#define Prec (GMP_NUMB_BITS >> 1)
/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
   return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs. */
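
/* Added sketch of the recursion, following the "Karatsuba Square Root"
   paper cited above: write the normalized operand as
   N = a3*b^3 + a2*b^2 + a1*b + a0 with a3 >= b/4.  Then
     (s1, r1) = SqrtRem (a3*b + a2)       [recursive call on the high half]
     (q, u)   = DivRem (r1*b + a1, 2*s1)  [the mpn_tdiv_qr call below]
     s        = s1*b + q
     r        = u*b + a0 - q^2            [adjusted in the c < 0 branch
                                           when it comes out negative]  */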
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c, b;         /* carry out of remainder */
  mp_size_t l, h;

  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  if (n == 1)
    c = mpn_sqrtrem2 (sp, np, np);
  else
    {
      l = n / 2;
      h = n - l;
      q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
      if (q != 0)
        ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
      TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
      mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
      q += scratch[l];
      c = scratch[0] & 1;
      mpn_rshift (sp, scratch, l, 1);
      sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
      if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
        return 1;   /* Remainder is non-zero */
      q >>= 1;
      if (c != 0)
        c = mpn_add_n (np + l, np + l, sp + l, h);
      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
      mpn_sqr (np + n, sp, l);
      b = q + mpn_sub_n (np, np, np + n, 2 * l);
      c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

      if (c < 0)
        {
          q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
          c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
          c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
          c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
          q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
        }
    }

  return c;
}
#if USE_DIVAPPR_Q
static void
mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
{
  gmp_pi1_t inv;
  mp_limb_t qh;
  ASSERT (dn > 2);
  ASSERT (nn >= dn);
  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);

  MPN_COPY (scratch, np, nn);
  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
  else
    {
      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
      TMP_DECL;
      TMP_MARK;
      /* Sadly, scratch is too small. */
      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
      TMP_FREE;
    }
  qp [nn - dn] = qh;
}
#endif
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division.  Too much?  Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;      /* carry out of {sp, n} */
  int c;            /* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
    }
  if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
       (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd) - nsh - 1)))) == 0)
    {
      mp_limb_t cy;
      /* Approximation is not good enough, the extra limb(+ nsh bits)
	 is smaller than needed to absorb the possible error. */
      /* {qp + 1, l + 1} equals 2*{sp, l} */
      /* FIXME: use mullo or wrap-around, or directly evaluate
	 remainder with a single sqrmod_bnm1. */
      TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
      ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
      /* Compute the remainder of the previous mpn_div(appr)_q. */
      cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
      MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
      ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
      if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	{
	  /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	  cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	  cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	  ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	  MPN_DECR_U (sp, l, 1);
	}
      /* Can the root be exact when a correction was needed?  We
	 did not find an example, but it depends on divappr
	 internals, and we can not assume it true in general... */
      /* else */
#else /* WANT_ASSERT */
      ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
      if (mpn_zero_p (tp + l + 1, h - l))
	{
	  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	  mpn_sqr (scratch, sp, l);
	  c = mpn_cmp (tp + 1, scratch + l, l);
	  if (c == 0)
	    {
	      if (nsh != 0)
		{
		  mpn_lshift (tp, np, l, 2 * nsh);
		  np = tp;
		}
	      c = mpn_cmp (np, scratch + odd, l - odd);
	    }
	  if (c < 0)
	    {
	      MPN_DECR_U (sp, l, 1);
	      c = 1;
	    }
	}
    }
  TMP_FREE;

  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t *tp, s0[1], cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      mp_limb_t mask;
      mp_ptr scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;	/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}
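
#if 0
/* Usage sketch, not part of GMP: compute floor(sqrt(N)) and the remainder
   for the single-limb operand N = 1000000000 (assumes limbs of at least
   32 bits).  Since 31622^2 = 999950884 and 31623^2 > 10^9, this prints
   sqrt = 31622, rem = 49116, rn = 1 (rn is nonzero because N is not a
   perfect square).  */
int
main (void)
{
  mp_limb_t np[1] = { CNST_LIMB (1000000000) };
  mp_limb_t sp[1], rp[1];
  mp_size_t rn = mpn_sqrtrem (sp, rp, np, 1);
  printf ("sqrt = %lu, rem = %lu, rn = %ld\n",
	  (unsigned long) sp[0], rn != 0 ? (unsigned long) rp[0] : 0UL,
	  (long) rn);
  return 0;
}
#endif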