source/libs/mpfr/mpfr-3.1.3/src/rec_sqrt.c

   1 /* mpfr_rec_sqrt -- inverse square root
   2
   3 Copyright 2008-2015 Free Software Foundation, Inc.
   4 Contributed by the AriC and Caramel projects, INRIA.
   5
   6 This file is part of the GNU MPFR Library.
   7
   8 The GNU MPFR Library is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU Lesser General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at your
  11 option) any later version.
  12
  13 The GNU MPFR Library is distributed in the hope that it will be useful, but
  14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 License for more details.
  17
  18 You should have received a copy of the GNU Lesser General Public License
  19 along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
  20 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
  21 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
  22
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25
  26 #define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
  27 #include "mpfr-impl.h"
  28
  29 #define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)
  30
  31 #define MPFR_COM_N(x,y,n)                               \
  32   {                                                     \
  33     mp_size_t i;                                        \
  34     for (i = 0; i < n; i++)                             \
  35       *((x)+i) = ~*((y)+i);                             \
  36   }
  37
  38 /* Put in X a p-bit approximation of 1/sqrt(A),
  39    where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
  40    A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
  41    where B = 2^GMP_NUMB_BITS.
  42
  43    We have 1 <= A < 4 and 1/2 <= X < 1.
  44
  45    The error in the approximate result with respect to the true
  46    value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
  47
  48    Note: x and a are left-aligned, i.e., the most significant bit of
  49    a[an-1] is set, and so is the most significant bit of the output x[n-1].
  50
  51    If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
  52    A are taken into account to compute the approximation of 1/sqrt(A), but
  53    whether or not they are zero, the error between X and 1/sqrt(A) is bounded
  54    by 1 ulp(X) [in precision p].
  55    The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
  56    are set to 0.
  57
  58    Assumptions:
  59    (1) A should be normalized, i.e., the most significant bit of a[an-1]
  60        should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
  61    (2) p >= 12
  62    (3) {a, an} and {x, n} should not overlap
  63    (4) GMP_NUMB_BITS >= 12 and is even
  64
  65    Note: this routine is much more efficient when ap is small compared to p,
  66    including the case where ap <= GMP_NUMB_BITS, thus it can be used to
  67    implement an efficient mpfr_rec_sqrt_ui function.
  68
  69    References:
  70    [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
  71    http://www.loria.fr/~zimmerma/mca/pub226.html
  72 */
  73 static void
  74 mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
  75                    mpfr_limb_srcptr a, mpfr_prec_t ap, int as)
  76
  77 {
  78   /* the following T1 and T2 are bipartite tables giving initial
  79      approximation for the inverse square root, with 13-bit input split in
  80      5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
  81      with i = a*2^8 + b*2^4 + c, we use for approximation of
  82      2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
  83      The largest error is obtained for i = 2054, where x = 2044,
  84      and 2048/sqrt(i/2048) = 2045.006576...
  85   */
  86   static short int T1[384] = {
  87 2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
  88 1944, 1938, 1931, /* a=8 */
  89 1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
  90 1844, 1838, 1832, /* a=9 */
  91 1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
  92 1757, 1752, 1747, /* a=10 */
  93 1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
  94 1681, 1677, 1673, /* a=11 */
  95 1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
  96 1615, 1611, 1607, /* a=12 */
  97 1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
  98 1556, 1552, 1549, /* a=13 */
  99 1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
 100 1502, 1499, 1496, /* a=14 */
 101 1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
 102 1454, 1451, 1449, /* a=15 */
 103 1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
 104 1411, 1408, 1405, /* a=16 */
 105 1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
 106 1371, 1368, 1366, /* a=17 */
 107 1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
 108 1333, 1331, 1329, /* a=18 */
 109 1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
 110 1300, 1298, 1296, /* a=19 */
 111 1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
 112 1268, 1266, 1265, /* a=20 */
 113 1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
 114 1239, 1237, 1235, /* a=21 */
 115 1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
 116 1212, 1210, 1208, /* a=22 */
 117 1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
 118 1185, 1184, 1182, /* a=23 */
 119 1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
 120 1162, 1160, 1159, /* a=24 */
 121 1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
 122 1139, 1137, 1136, /* a=25 */
 123 1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
 124 1117, 1116, 1115, /* a=26 */
 125 1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
 126 1098, 1096, 1095, /* a=27 */
 127 1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
 128 1078, 1077, 1076, /* a=28 */
 129 1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
 130 1060, 1059, 1058, /* a=29 */
 131 1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
 132 1043, 1042, 1041, /* a=30 */
 133 1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
 134 1027, 1026, 1025 /* a=31 */
 135 };
 136   static unsigned char T2[384] = {
 137     7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
 138     6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
 139     5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
 140     4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
 141     3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
 142     3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
 143     3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
 144     2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
 145     2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
 146     2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
 147     3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
 148     2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
 149     1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
 150     1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
 151     1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
 152     2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
 153     1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
 154     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
 155     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
 156     1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
 157     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
 158     1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
 159     1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
 160     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
 161 };
 162   mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
 163   mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */
 164
 165   /* A should be normalized */
 166   MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
 167   /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
 168      Since that does not depend on MPFR, we always check this. */
 169   MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
 170   /* {a, an} and {x, n} should not overlap */
 171   MPFR_ASSERTD((a + an <= x) || (x + n <= a));
 172   MPFR_ASSERTD(p >= 11);
 173
 174   if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
 175     {
 176       a += an - n;
 177       an = n;
 178     }
 179
 180   if (p == 11) /* should happen only from recursive calls */
 181     {
 182       unsigned long i, ab, ac;
 183       mp_limb_t t;
 184
 185       /* take the 12+as most significant bits of A */
 186       i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
 187       /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
 188       ab = i >> 4;
 189       ac = (ab & 0x3F0) | (i & 0x0F);
 190       t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
 191       x[0] = t << (GMP_NUMB_BITS - p);
 192     }
 193   else /* p >= 12 */
 194     {
 195       mpfr_prec_t h, pl;
 196       mpfr_limb_ptr r, s, t, u;
 197       mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
 198       mp_limb_t neg, cy, cu;
 199       MPFR_TMP_DECL(marker);
 200
 201       /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
 202          and A/4 if as=1. */
 203
 204       /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
 205       h = (p < 18) ? 11 : (p >> 1) + 2;
 206
 207       xn = LIMB_SIZE(h);       /* limb size of the recursive Xh */
 208       rn = LIMB_SIZE(2 * h);   /* a priori limb size of Xh^2 */
 209       ln = n - xn;             /* remaining limbs to be computed */
 210
 211       /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
 212          we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
 213          thus the h-3 most significant bits of t should be zero,
 214          which is in fact h+1+as-3 because of the normalization of A.
 215          This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.
 216
 217          More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
 218          <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
 219          since A < 4 and h >= 11, thus
 220          |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
 221          This is sufficient to prove that the upper limb of {t,tn} below is
 222          less that 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
 223       */
 224       th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
 225       tn = LIMB_SIZE(2 * h + 1 + as);
 226
 227       /* we need h+1+as bits of a */
 228       ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
 229                                       needed for the recursive call*/
 230       if (MPFR_UNLIKELY(ahn > an))
 231         ahn = an;
 232       mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
 233       /* the most h significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
 234          limbs, the low (-h) % GMP_NUMB_BITS bits are zero */
 235
 236       /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */
 237
 238       MPFR_TMP_MARK (marker);
 239       /* first step: square X in r, result is exact */
 240       un = xn + (tn - th);
 241       /* We use the same temporary buffer to store r and u: r needs 2*xn
 242          limbs where u needs xn+(tn-th) limbs. Since tn can store at least
 243          2h bits, and th at most h bits, then tn-th can store at least h bits,
 244          thus tn - th >= xn, and reserving the space for u is enough. */
 245       MPFR_ASSERTD(2 * xn <= un);
 246       u = r = MPFR_TMP_LIMBS_ALLOC (un);
 247       if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
 248                                      thus ln = 0 */
 249         {
 250           MPFR_ASSERTD(ln == 0);
 251           cy = x[0] >> (GMP_NUMB_BITS >> 1);
 252           r ++;
 253           r[0] = cy * cy;
 254         }
 255       else if (xn == 1) /* xn=1, rn=2 */
 256         umul_ppmm(r[1], r[0], x[ln], x[ln]);
 257       else
 258         {
 259           mpn_mul_n (r, x + ln, x + ln, xn);
 260           /* we have {r, 2*xn} = X_h^2 */
 261           if (rn < 2 * xn)
 262             r ++;
 263         }
 264       /* now the 2h most significant bits of {r, rn} contains X^2, r has rn
 265          limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */
 266
 267       /* Second step: s <- A * (r^2), and truncate the low ap bits,
 268          i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
 269        */
 270       sn = an + rn;
 271       s = MPFR_TMP_LIMBS_ALLOC (sn);
 272       if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
 273                            and 2h >= p+3 */
 274         {
 275           /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
 276              bits from A */
 277           /* since n=1, and we ensured an <= n, we also have an=1 */
 278           MPFR_ASSERTD(an == 1);
 279           umul_ppmm (s[1], s[0], r[0], a[0]);
 280         }
 281       else
 282         {
 283           /* we have p <= n * GMP_NUMB_BITS
 284              2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
 285              thus n <= rn <= n + 1 */
 286           MPFR_ASSERTD(rn <= n + 1);
 287           /* since we ensured an <= n, we have an <= rn */
 288           MPFR_ASSERTD(an <= rn);
 289           mpn_mul (s, r, rn, a, an);
 290           /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
 291              100000... or 011111... if as=0, or
 292              010000... or 001111... if as=1.
 293              We ignore the bits of s after the first 2h+1+as ones.
 294              We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
 295         }
 296
 297       /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
 298          limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
 299       t = s + sn - tn; /* pointer to low limb of the high part of t */
 300       /* the upper h-3 bits of 1-t should be zero,
 301          where 1 corresponds to the most significant bit of t[tn-1] if as=0,
 302          and to the 2nd most significant bit of t[tn-1] if as=1 */
 303
 304       /* compute t <- 1 - t, which is B^tn - {t, tn+1},
 305          with rounding toward -Inf, i.e., rounding the input t toward +Inf.
 306          We could only modify the low tn - th limbs from t, but it gives only
 307          a small speedup, and would make the code more complex.
 308       */
 309       neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
 310       if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
 311                        is the part truncated above, thus 1 - t rounded to -Inf
 312                        is 1 - th - ulp(th) */
 313         {
 314           /* since the 1+as most significant bits of t are zero, set them
 315              to 1 before the one-complement */
 316           t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
 317           MPFR_COM_N (t, t, tn);
 318           /* we should add 1 here to get 1-th complement, and subtract 1 for
 319              -ulp(th), thus we do nothing */
 320         }
 321       else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
 322               th + eps rounded toward +Inf, which is th + ulp(th):
 323               we discard the bit corresponding to 1,
 324               and we add 1 to the least significant bit of t */
 325         {
 326           t[tn - 1] ^= neg;
 327           mpn_add_1 (t, t, tn, 1);
 328         }
 329       tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_LIMBS) of
 330                    the high limbs of {t, tn} are zero */
 331
 332       /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
 333          th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
 334       MPFR_ASSERTD(tn > 0);
 335
 336       /* u <- x * t, where {t, tn} contains at least h+3 bits,
 337          and {x, xn} contains h bits, thus tn >= xn */
 338       MPFR_ASSERTD(tn >= xn);
 339       if (tn == 1) /* necessarily xn=1 */
 340         umul_ppmm (u[1], u[0], t[0], x[ln]);
 341       else
 342         mpn_mul (u, t, tn, x + ln, xn);
 343
 344       /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */
 345
 346       /* we have already discarded the upper th high limbs of t, thus we only
 347          have to consider the upper n - th limbs of u */
 348       un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
 349                       h = ceil((p+3)/2) <= (p+4)/2,
 350                       th*GMP_NUMB_BITS <= h-1 <= p/2+1,
 351                       thus (n-th)*GMP_NUMB_BITS >= p/2-1.
 352                    */
 353       MPFR_ASSERTD(un > 0);
 354       u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
 355                                            = xn + original_tn - n
 356                               = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
 357                               since 2h >= p+3 */
 358       MPFR_ASSERTD(tn + xn > un); /* will allow to access u[-1] below */
 359
 360       /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
 361          need to add or subtract.
 362          In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
 363          u by 2. */
 364
 365       if (as == 1)
 366         /* shift on un+1 limbs to get most significant bit of u[-1] into
 367            least significant bit of u[0] */
 368         mpn_lshift (u - 1, u - 1, un + 1, 1);
 369
 370       /* now {u,un} represents U / 2 from Algorithm 3.9 */
 371
 372       pl = n * GMP_NUMB_BITS - p;       /* low bits from x */
 373       /* We want that the low pl bits are zero after rounding to nearest,
 374          thus we round u to nearest at bit pl-1 of u[0] */
 375       if (pl > 0)
 376         {
 377           cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
 378           /* mask bits 0..pl-1 of u[0] */
 379           u[0] &= ~MPFR_LIMB_MASK(pl);
 380         }
 381       else /* round bit is in u[-1] */
 382         cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
 383       MPFR_ASSERTN(cu == 0);
 384
 385       /* We already have filled {x + ln, xn = n - ln}, and we want to add or
 386          subtract {u, un} at position x.
 387          un = n - th, where th contains <= h+1+as-3<=h-1 bits
 388          ln = n - xn, where xn contains >= h bits
 389          thus un > ln.
 390          Warning: ln might be zero.
 391       */
 392       MPFR_ASSERTD(un > ln);
 393       /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
 394          p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
 395       MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
 396       /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
 397          we need to add or subtract the overlapping part {u + ln, un - ln} */
 398       /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
 399          below (with size = th) mustn't be used. */
 400       if (neg == 0)
 401         {
 402           if (ln > 0)
 403             MPN_COPY (x, u, ln);
 404           cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
 405           /* cy is the carry at x + (ln + xn) = x + n */
 406         }
 407       else /* negative case */
 408         {
 409           /* subtract {u+ln, un-ln} from {x+ln,un} */
 410           cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
 411           /* cy is the borrow at x + (ln + xn) = x + n */
 412
 413           /* cy cannot be non-zero, since the most significant bit of Xh is 1,
 414              and the correction is bounded by 2^{-h+3} */
 415           MPFR_ASSERTD(cy == 0);
 416           if (ln > 0)
 417             {
 418               MPFR_COM_N (x, u, ln);
 419               /* we must add one for the 2-complement ... */
 420               cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
 421               /* ... and subtract 1 at x[ln], where n = ln + xn */
 422               cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
 423             }
 424         }
 425
 426       /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
 427          have X = B^n, and setting X to 1-2^{-p} satisties the error bound
 428          of 1 ulp. */
 429       if (MPFR_UNLIKELY(cy != 0))
 430         {
 431           cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
 432           MPFR_ASSERTD(cy == 0);
 433         }
 434
 435       MPFR_TMP_FREE (marker);
 436     }
 437 }
 438
 439 int
 440 mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
 441 {
 442   mpfr_prec_t rp, up, wp;
 443   mp_size_t rn, wn;
 444   int s, cy, inex;
 445   mpfr_limb_ptr x;
 446   MPFR_TMP_DECL(marker);
 447
 448   MPFR_LOG_FUNC
 449     (("x[%Pu]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
 450      ("y[%Pu]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));
 451
 452   /* special values */
 453   if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
 454     {
 455       if (MPFR_IS_NAN(u))
 456         {
 457           MPFR_SET_NAN(r);
 458           MPFR_RET_NAN;
 459         }
 460       else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
 461         {
 462           /* 0+ or 0- */
 463           MPFR_SET_INF(r);
 464           MPFR_SET_POS(r);
 465           mpfr_set_divby0 ();
 466           MPFR_RET(0); /* Inf is exact */
 467         }
 468       else
 469         {
 470           MPFR_ASSERTD(MPFR_IS_INF(u));
 471           /* 1/sqrt(-Inf) = NAN */
 472           if (MPFR_IS_NEG(u))
 473             {
 474               MPFR_SET_NAN(r);
 475               MPFR_RET_NAN;
 476             }
 477           /* 1/sqrt(+Inf) = +0 */
 478           MPFR_SET_POS(r);
 479           MPFR_SET_ZERO(r);
 480           MPFR_RET(0);
 481         }
 482     }
 483
 484   /* if u < 0, 1/sqrt(u) is NaN */
 485   if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
 486     {
 487       MPFR_SET_NAN(r);
 488       MPFR_RET_NAN;
 489     }
 490
 491   MPFR_SET_POS(r);
 492
 493   rp = MPFR_PREC(r); /* output precision */
 494   up = MPFR_PREC(u); /* input precision */
 495   wp = rp + 11;      /* initial working precision */
 496
 497   /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
 498      If e is even, we compute an approximation of X of (4U)^{-1/2},
 499      and the result is X*2^(-(e-2)/2) [case s=1].
 500      If e is odd, we compute an approximation of X of (2U)^{-1/2},
 501      and the result is X*2^(-(e-1)/2) [case s=0]. */
 502
 503   /* parity of the exponent of u */
 504   s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);
 505
 506   rn = LIMB_SIZE(rp);
 507
 508   /* for the first iteration, if rp + 11 fits into rn limbs, we round up
 509      up to a full limb to maximize the chance of rounding, while avoiding
 510      to allocate extra space */
 511   wp = rp + 11;
 512   if (wp < rn * GMP_NUMB_BITS)
 513     wp = rn * GMP_NUMB_BITS;
 514   for (;;)
 515     {
 516       MPFR_TMP_MARK (marker);
 517       wn = LIMB_SIZE(wp);
 518       if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
 519         x = MPFR_TMP_LIMBS_ALLOC (wn);
 520       else
 521         x = MPFR_MANT(r);
 522       mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
 523       /* If the input was not truncated, the error is at most one ulp;
 524          if the input was truncated, the error is at most two ulps
 525          (see algorithms.tex). */
 526       if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
 527                                      rp + (rnd_mode == MPFR_RNDN))))
 528         break;
 529
 530       /* We detect only now the exact case where u=2^(2e), to avoid
 531          slowing down the average case. This can happen only when the
 532          mantissa is exactly 1/2 and the exponent is odd. */
 533       if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
 534         {
 535           mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;
 536
 537           /* we should have x=111...111 */
 538           mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
 539           x[wn - 1] = MPFR_LIMB_HIGHBIT;
 540           s += 2;
 541           break; /* go through */
 542         }
 543       MPFR_TMP_FREE(marker);
 544
 545       wp += GMP_NUMB_BITS;
 546     }
 547   cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
 548   MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
 549   if (MPFR_UNLIKELY(cy != 0))
 550     {
 551       MPFR_EXP(r) ++;
 552       MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
 553     }
 554   MPFR_TMP_FREE(marker);
 555   return mpfr_check_range (r, inex, rnd_mode);
 556 }