1 /* mpfr_rec_sqrt -- inverse square root
3 Copyright 2008, 2009 Free Software Foundation, Inc.
4 Contributed by the Arenaire and Cacao projects, INRIA.
6 This file is part of the GNU MPFR Library.
8 The GNU MPFR Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or (at your
11 option) any later version.
13 The GNU MPFR Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MPFR Library; see the file COPYING.LIB. If not, write to
20 the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
21 MA 02110-1301, USA. */
26 #define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
27 #include "mpfr-impl.h"
29 #define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_BITS_PER_MP_LIMB) + 1)
31 #define MPFR_COM_N(x,y,n) \
34 for (i = 0; i < n; i++) \
35 *((x)+i) = ~*((y)+i); \
38 /* Put in X a p-bit approximation of 1/sqrt(A),
39 where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
40 A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
41 where B = 2^GMP_NUMB_BITS.
43 We have 1 <= A < 4 and 1/2 <= X < 1.
45 The error in the approximate result with respect to the true
46 value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
48 Note: x and a are left-aligned, i.e., the most significant bit of
49 a[an-1] is set, and so is the most significant bit of the output x[n-1].
51 If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
52 A are taken into account to compute the approximation of 1/sqrt(A), but
53 whether or not they are zero, the error between X and 1/sqrt(A) is bounded
54 by 1 ulp(X) [in precision p].
55 The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
   are set to zero.
59 (1) A should be normalized, i.e., the most significant bit of a[an-1]
60 should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
(2) p >= 11 [NOTE(review): item (2) is missing in this copy; reconstructed from
    the MPFR_ASSERTD(p >= 11) check in the function body -- confirm upstream]
62 (3) {a, an} and {x, n} should not overlap
63 (4) GMP_NUMB_BITS >= 12 and is even
65 Note: this routine is much more efficient when ap is small compared to p,
66 including the case where ap <= GMP_NUMB_BITS, thus it can be used to
67 implement an efficient mpfr_rec_sqrt_ui function.
69 Reference: Modern Computer Algebra, Richard Brent and Paul Zimmermann,
70 http://www.loria.fr/~zimmerma/mca/pub226.html
73 mpfr_mpn_rec_sqrt (mp_ptr x
, mp_prec_t p
,
74 mp_srcptr a
, mp_prec_t ap
, int as
)
77 /* the following T1 and T2 are bipartite tables giving initial
78 approximation for the inverse square root, with 13-bit input split in
79 5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
80 with i = a*2^8 + b*2^4 + c, we use for approximation of
81 2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
82 The largest error is obtained for i = 2054, where x = 2044,
83 and 2048/sqrt(i/2048) = 2045.006576...
85 static short int T1
[384] = {
86 2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
87 1944, 1938, 1931, /* a=8 */
88 1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
89 1844, 1838, 1832, /* a=9 */
90 1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
91 1757, 1752, 1747, /* a=10 */
92 1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
93 1681, 1677, 1673, /* a=11 */
94 1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
95 1615, 1611, 1607, /* a=12 */
96 1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
97 1556, 1552, 1549, /* a=13 */
98 1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
99 1502, 1499, 1496, /* a=14 */
100 1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
101 1454, 1451, 1449, /* a=15 */
102 1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
103 1411, 1408, 1405, /* a=16 */
104 1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
105 1371, 1368, 1366, /* a=17 */
106 1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
107 1333, 1331, 1329, /* a=18 */
108 1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
109 1300, 1298, 1296, /* a=19 */
110 1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
111 1268, 1266, 1265, /* a=20 */
112 1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
113 1239, 1237, 1235, /* a=21 */
114 1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
115 1212, 1210, 1208, /* a=22 */
116 1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
117 1185, 1184, 1182, /* a=23 */
118 1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
119 1162, 1160, 1159, /* a=24 */
120 1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
121 1139, 1137, 1136, /* a=25 */
122 1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
123 1117, 1116, 1115, /* a=26 */
124 1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
125 1098, 1096, 1095, /* a=27 */
126 1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
127 1078, 1077, 1076, /* a=28 */
128 1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
129 1060, 1059, 1058, /* a=29 */
130 1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
131 1043, 1042, 1041, /* a=30 */
132 1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
133 1027, 1026, 1025 /* a=31 */
135 static unsigned char T2
[384] = {
136 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
137 6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
138 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
139 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
140 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
141 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
142 3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
143 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
144 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
145 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
146 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
147 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
148 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
149 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
150 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
151 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
152 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
155 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
157 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
158 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* a=31 */
161 mp_size_t n
= LIMB_SIZE(p
); /* number of limbs of X */
162 mp_size_t an
= LIMB_SIZE(ap
); /* number of limbs of A */
164 /* A should be normalized */
165 MPFR_ASSERTD((a
[an
- 1] & MPFR_LIMB_HIGHBIT
) != 0);
166 /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
167 Since that does not depend on MPFR, we always check this. */
168 MPFR_ASSERTN((GMP_NUMB_BITS
>= 12) && ((GMP_NUMB_BITS
& 1) == 0));
169 /* {a, an} and {x, n} should not overlap */
170 MPFR_ASSERTD((a
+ an
<= x
) || (x
+ n
<= a
));
171 MPFR_ASSERTD(p
>= 11);
173 if (MPFR_UNLIKELY(an
> n
)) /* we can cut the input to n limbs */
179 if (p
== 11) /* should happen only from recursive calls */
181 unsigned long i
, ab
, ac
;
184 /* take the 12+as most significant bits of A */
185 i
= a
[an
- 1] >> (GMP_NUMB_BITS
- (12 + as
));
186 /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
188 ac
= (ab
& 0x3F0) | (i
& 0x0F);
189 t
= (mp_limb_t
) T1
[ab
- 0x80] + (mp_limb_t
) T2
[ac
- 0x80];
190 x
[0] = t
<< (GMP_NUMB_BITS
- p
);
196 mp_size_t xn
, rn
, th
, ln
, tn
, sn
, ahn
, un
;
197 mp_limb_t neg
, cy
, cu
;
198 MPFR_TMP_DECL(marker
);
200 /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
201 h
= (p
< 18) ? 11 : (p
>> 1) + 2;
203 xn
= LIMB_SIZE(h
); /* limb size of the recursive Xh */
204 rn
= LIMB_SIZE(2 * h
); /* a priori limb size of Xh^2 */
205 ln
= n
- xn
; /* remaining limbs to be computed */
207 /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
208 we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
209 thus the h-3 most significant bits of t should be zero,
210 which is in fact h+1+as-3 because of the normalization of A.
211 This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs. */
212 th
= (h
+ 1 + as
- 3) >> MPFR_LOG2_BITS_PER_MP_LIMB
;
213 tn
= LIMB_SIZE(2 * h
+ 1 + as
);
215 /* we need h+1+as bits of a */
216 ahn
= LIMB_SIZE(h
+ 1 + as
); /* number of high limbs of A
217 needed for the recursive call*/
218 if (MPFR_UNLIKELY(ahn
> an
))
220 mpfr_mpn_rec_sqrt (x
+ ln
, h
, a
+ an
- ahn
, ahn
* GMP_NUMB_BITS
, as
);
221 /* the most h significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
222 limbs, the low (-h) % GMP_NUMB_BITS bits are zero */
224 MPFR_TMP_MARK (marker
);
225 /* first step: square X in r, result is exact */
227 /* We use the same temporary buffer to store r and u: r needs 2*xn
228 limbs where u needs xn+(tn-th) limbs. Since tn can store at least
229 2h bits, and th at most h bits, then tn-th can store at least h bits,
230 thus tn - th >= xn, and reserving the space for u is enough. */
231 MPFR_ASSERTD(2 * xn
<= un
);
232 u
= r
= (mp_ptr
) MPFR_TMP_ALLOC (un
* sizeof (mp_limb_t
));
233 if (2 * h
<= GMP_NUMB_BITS
) /* xn=rn=1, and since p <= 2h-3, n=1,
236 MPFR_ASSERTD(ln
== 0);
237 cy
= x
[0] >> (GMP_NUMB_BITS
>> 1);
241 else if (xn
== 1) /* xn=1, rn=2 */
242 umul_ppmm(r
[1], r
[0], x
[ln
], x
[ln
]);
245 mpn_mul_n (r
, x
+ ln
, x
+ ln
, xn
);
249 /* now the 2h most significant bits of {r, rn} contains X^2, r has rn
250 limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */
252 /* Second step: s <- A * (r^2), and truncate the low ap bits,
253 i.e., at weight 2^{-2h} (s is aligned to the low significant bits)
256 s
= (mp_ptr
) MPFR_TMP_ALLOC (sn
* sizeof (mp_limb_t
));
257 if (rn
== 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
260 /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
262 /* since n=1, and we ensured an <= n, we also have an=1 */
263 MPFR_ASSERTD(an
== 1);
264 umul_ppmm (s
[1], s
[0], r
[0], a
[0]);
268 /* we have p <= n * GMP_NUMB_BITS
269 2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
270 thus n <= rn <= n + 1 */
271 MPFR_ASSERTD(rn
<= n
+ 1);
272 /* since we ensured an <= n, we have an <= rn */
273 MPFR_ASSERTD(an
<= rn
);
274 mpn_mul (s
, r
, rn
, a
, an
);
275 /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
276 100000... or 011111... if as=0, or
277 010000... or 001111... if as=1.
278 We ignore the bits of s after the first 2h+1+as ones.
282 /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
283 limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
284 t
= s
+ sn
- tn
; /* pointer to low limb of the high part of t */
285 /* the upper h-3 bits of 1-t should be zero,
286 where 1 corresponds to the most significant bit of t[tn-1] if as=0,
287 and to the 2nd most significant bit of t[tn-1] if as=1 */
289 /* compute t <- 1 - t, which is B^tn - {t, tn+1},
290 with rounding towards -Inf, i.e., rounding the input t towards +Inf.
291 We could only modify the low tn - th limbs from t, but it gives only
292 a small speedup, and would make the code more complex.
294 neg
= t
[tn
- 1] & (MPFR_LIMB_HIGHBIT
>> as
);
295 if (neg
== 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
296 is the part truncated above, thus 1 - t rounded to -Inf
297 is 1 - th - ulp(th) */
299 /* since the 1+as most significant bits of t are zero, set them
300 to 1 before the one-complement */
301 t
[tn
- 1] |= MPFR_LIMB_HIGHBIT
| (MPFR_LIMB_HIGHBIT
>> as
);
302 MPFR_COM_N (t
, t
, tn
);
303 /* we should add 1 here to get 1-th complement, and subtract 1 for
304 -ulp(th), thus we do nothing */
306 else /* negative case: we want 1 - t rounded towards -Inf, i.e.,
307 th + eps rounded towards +Inf, which is th + ulp(th):
308 we discard the bit corresponding to 1,
309 and we add 1 to the least significant bit of t */
312 mpn_add_1 (t
, t
, tn
, 1);
314 tn
-= th
; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_LIMBS) of
315 the high limbs of {t, tn} are zero */
317 /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
318 th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
319 MPFR_ASSERTD(tn
> 0);
321 /* u <- x * t, where {t, tn} contains at least h+3 bits,
322 and {x, xn} contains h bits, thus tn >= xn */
323 MPFR_ASSERTD(tn
>= xn
);
324 if (tn
== 1) /* necessarily xn=1 */
325 umul_ppmm (u
[1], u
[0], t
[0], x
[ln
]);
327 mpn_mul (u
, t
, tn
, x
+ ln
, xn
);
329 /* we have already discarded the upper th high limbs of t, thus we only
330 have to consider the upper n - th limbs of u */
331 un
= n
- th
; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
332 h = ceil((p+3)/2) <= (p+4)/2,
333 th*GMP_NUMB_BITS <= h-1 <= p/2+1,
334 thus (n-th)*GMP_NUMB_BITS >= p/2-1.
336 MPFR_ASSERTD(un
> 0);
337 u
+= (tn
+ xn
) - un
; /* xn + tn - un = xn + (original_tn - th) - (n - th)
338 = xn + original_tn - n
339 = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
341 MPFR_ASSERTD(tn
+ xn
> un
); /* will allow to access u[-1] below */
343 /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
344 need to add or subtract.
345 In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
349 /* shift on un+1 limbs to get most significant bit of u[-1] into
350 least significant bit of u[0] */
351 mpn_lshift (u
- 1, u
- 1, un
+ 1, 1);
353 pl
= n
* GMP_NUMB_BITS
- p
; /* low bits from x */
354 /* We want that the low pl bits are zero after rounding to nearest,
355 thus we round u to nearest at bit pl-1 of u[0] */
358 cu
= mpn_add_1 (u
, u
, un
, u
[0] & (MPFR_LIMB_ONE
<< (pl
- 1)));
359 /* mask bits 0..pl-1 of u[0] */
360 u
[0] &= ~MPFR_LIMB_MASK(pl
);
362 else /* round bit is in u[-1] */
363 cu
= mpn_add_1 (u
, u
, un
, u
[-1] >> (GMP_NUMB_BITS
- 1));
365 /* We already have filled {x + ln, xn = n - ln}, and we want to add or
366 subtract cu*B^un + {u, un} at position x.
367 un = n - th, where th contains <= h+1+as-3<=h-1 bits
368 ln = n - xn, where xn contains >= h bits
370 Warning: ln might be zero.
372 MPFR_ASSERTD(un
> ln
);
373 /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
374 p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
375 MPFR_ASSERTD(un
== ln
+ 1 || un
== ln
+ 2);
376 /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
377 we need to add or subtract the overlapping part {u + ln, un - ln} */
382 cy
= mpn_add (x
+ ln
, x
+ ln
, xn
, u
+ ln
, un
- ln
);
384 cy
+= mpn_add_1 (x
+ un
, x
+ un
, th
, cu
);
386 else /* negative case */
388 /* subtract {u+ln, un-ln} from {x+ln,un} */
389 cy
= mpn_sub (x
+ ln
, x
+ ln
, xn
, u
+ ln
, un
- ln
);
390 /* carry cy is at x+un, like cu */
391 cy
= mpn_sub_1 (x
+ un
, x
+ un
, th
, cy
+ cu
); /* n - un = th */
392 /* cy cannot be zero, since the most significant bit of Xh is 1,
393 and the correction is bounded by 2^{-h+3} */
394 MPFR_ASSERTD(cy
== 0);
397 MPFR_COM_N (x
, u
, ln
);
398 /* we must add one for the 2-complement ... */
399 cy
= mpn_add_1 (x
, x
, n
, MPFR_LIMB_ONE
);
400 /* ... and subtract 1 at x[ln], where n = ln + xn */
401 cy
-= mpn_sub_1 (x
+ ln
, x
+ ln
, xn
, MPFR_LIMB_ONE
);
405 /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
406 have X = B^n, and setting X to 1-2^{-p} satisties the error bound
408 if (MPFR_UNLIKELY(cy
!= 0))
410 cy
-= mpn_sub_1 (x
, x
, n
, MPFR_LIMB_ONE
<< pl
);
411 MPFR_ASSERTD(cy
== 0);
414 MPFR_TMP_FREE (marker
);
419 mpfr_rec_sqrt (mpfr_ptr r
, mpfr_srcptr u
, mp_rnd_t rnd_mode
)
421 mp_prec_t rp
, up
, wp
;
426 MPFR_TMP_DECL(marker
);
428 MPFR_LOG_FUNC (("x[%#R]=%R rnd=%d", u
, u
, rnd_mode
),
429 ("y[%#R]=%R inexact=%d", r
, r
, inex
));
432 if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u
)))
439 else if (MPFR_IS_ZERO(u
)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
444 MPFR_RET(0); /* Inf is exact */
448 MPFR_ASSERTD(MPFR_IS_INF(u
));
449 /* 1/sqrt(-Inf) = NAN */
455 /* 1/sqrt(+Inf) = +0 */
462 /* if u < 0, 1/sqrt(u) is NaN */
463 if (MPFR_UNLIKELY(MPFR_IS_NEG(u
)))
472 rp
= MPFR_PREC(r
); /* output precision */
473 up
= MPFR_PREC(u
); /* input precision */
474 wp
= rp
+ 11; /* initial working precision */
476 /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
477 If e is even, we compute an approximation of X of (4U)^{-1/2},
478 and the result is X*2^(-(e-2)/2) [case s=1].
479 If e is odd, we compute an approximation of X of (2U)^{-1/2},
480 and the result is X*2^(-(e-1)/2) [case s=0]. */
482 /* parity of the exponent of u */
483 s
= 1 - ((mpfr_uexp_t
) MPFR_GET_EXP (u
) & 1);
487 /* for the first iteration, if rp + 11 fits into rn limbs, we round up
488 up to a full limb to maximize the chance of rounding, while avoiding
489 to allocate extra space */
491 if (wp
< rn
* BITS_PER_MP_LIMB
)
492 wp
= rn
* BITS_PER_MP_LIMB
;
495 MPFR_TMP_MARK (marker
);
497 out_of_place
= (r
== u
) || (wn
> rn
);
499 x
= (mp_ptr
) MPFR_TMP_ALLOC (wn
* sizeof (mp_limb_t
));
502 mpfr_mpn_rec_sqrt (x
, wp
, MPFR_MANT(u
), up
, s
);
503 /* If the input was not truncated, the error is at most one ulp;
504 if the input was truncated, the error is at most two ulps
505 (see algorithms.tex). */
506 if (MPFR_LIKELY (mpfr_round_p (x
, wn
, wp
- (wp
< up
),
507 rp
+ (rnd_mode
== GMP_RNDN
))))
510 /* We detect only now the exact case where u=2^(2e), to avoid
511 slowing down the average case. This can happen only when the
512 mantissa is exactly 1/2 and the exponent is odd. */
513 if (s
== 0 && mpfr_cmp_ui_2exp (u
, 1, MPFR_EXP(u
) - 1) == 0)
515 mp_prec_t pl
= wn
* BITS_PER_MP_LIMB
- wp
;
517 /* we should have x=111...111 */
518 mpn_add_1 (x
, x
, wn
, MPFR_LIMB_ONE
<< pl
);
519 x
[wn
- 1] = MPFR_LIMB_HIGHBIT
;
521 break; /* go through */
523 MPFR_TMP_FREE(marker
);
525 wp
+= BITS_PER_MP_LIMB
;
527 cy
= mpfr_round_raw (MPFR_MANT(r
), x
, wp
, 0, rp
, rnd_mode
, &inex
);
528 MPFR_EXP(r
) = - (MPFR_EXP(u
) - 1 - s
) / 2;
529 if (MPFR_UNLIKELY(cy
!= 0))
532 MPFR_MANT(r
)[rn
- 1] = MPFR_LIMB_HIGHBIT
;
534 MPFR_TMP_FREE(marker
);
535 return mpfr_check_range (r
, inex
, rnd_mode
);