/* mpfr_rec_sqrt -- inverse square root

Copyright 2008, 2009 Free Software Foundation, Inc.
Contributed by the Arenaire and Cacao projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1301, USA. */
#include <stdio.h>
#include <stdlib.h>

#define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
#include "mpfr-impl.h"
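/* LIMB_SIZE(p) is the number of limbs needed to store p bits, i.e.,
   ceil(p/GMP_NUMB_BITS); e.g., with 64-bit limbs, LIMB_SIZE(64) = 1 and
   LIMB_SIZE(65) = 2. */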
#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_BITS_PER_MP_LIMB) + 1)
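/* MPFR_COM_N(x,y,n) stores in the n limbs starting at x the one-complement
   (bitwise NOT) of the n limbs starting at y; x may equal y. */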
#define MPFR_COM_N(x,y,n) \
  { \
    mp_size_t i; \
    for (i = 0; i < n; i++) \
      *((x)+i) = ~*((y)+i); \
  }
/* Put in X a p-bit approximation of 1/sqrt(A),
   where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
   A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
   where B = 2^GMP_NUMB_BITS.

   We have 1 <= A < 4 and 1/2 <= X < 1.

   The error in the approximate result with respect to the true
   value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.

   Note: x and a are left-aligned, i.e., the most significant bit of
   a[an-1] is set, and so is the most significant bit of the output x[n-1].

   If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
   A are taken into account to compute the approximation of 1/sqrt(A), but
   whether or not they are zero, the error between X and 1/sqrt(A) is bounded
   by 1 ulp(X) [in precision p].
   The extra low bits of the output X (if p is not a multiple of
   GMP_NUMB_BITS) are set to 0.

   Assumptions:
   (1) A should be normalized, i.e., the most significant bit of a[an-1]
       should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
   (2) p >= 12
   (3) {a, an} and {x, n} should not overlap
   (4) GMP_NUMB_BITS >= 12 and is even

   Note: this routine is much more efficient when ap is small compared to p,
   including the case where ap <= GMP_NUMB_BITS, thus it can be used to
   implement an efficient mpfr_rec_sqrt_ui function.

   Reference: Modern Computer Algebra, Richard Brent and Paul Zimmermann,
   http://www.loria.fr/~zimmerma/mca/pub226.html
*/
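/* The routine below is the classical Newton iteration for the inverse
   square root, x' = x + x*(1 - A*x^2)/2: an approximation Xh of 1/sqrt(A)
   is first obtained at precision h = max(11, ceil((p+3)/2)) by a recursive
   call (the base case p = 11 is read from the bipartite tables T1/T2), and
   one correction step then extends it to precision p. */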
static void
mpfr_mpn_rec_sqrt (mp_ptr x, mp_prec_t p,
                   mp_srcptr a, mp_prec_t ap, int as)
{
  /* the following T1 and T2 are bipartite tables giving initial
     approximation for the inverse square root, with 13-bit input split in
     5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
     with i = a*2^8 + b*2^4 + c, we use for approximation of
     2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
     The largest error is obtained for i = 2054, where x = 2044,
     and 2048/sqrt(i/2048) = 2045.006576... */
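  /* For example, i = 4096 (i.e., A = 2) gives a = 16, b = c = 0, thus
     x = T1[128] + T2[128] = 1446 + 2 = 1448, while the exact value is
     2048/sqrt(4096/2048) = 2048/sqrt(2) = 1448.154... */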
  static short int T1[384] = {
    2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
    1944, 1938, 1931, /* a=8 */
    1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
    1844, 1838, 1832, /* a=9 */
    1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
    1757, 1752, 1747, /* a=10 */
    1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
    1681, 1677, 1673, /* a=11 */
    1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
    1615, 1611, 1607, /* a=12 */
    1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
    1556, 1552, 1549, /* a=13 */
    1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
    1502, 1499, 1496, /* a=14 */
    1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
    1454, 1451, 1449, /* a=15 */
    1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
    1411, 1408, 1405, /* a=16 */
    1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
    1371, 1368, 1366, /* a=17 */
    1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
    1333, 1331, 1329, /* a=18 */
    1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
    1300, 1298, 1296, /* a=19 */
    1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
    1268, 1266, 1265, /* a=20 */
    1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
    1239, 1237, 1235, /* a=21 */
    1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
    1212, 1210, 1208, /* a=22 */
    1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
    1185, 1184, 1182, /* a=23 */
    1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
    1162, 1160, 1159, /* a=24 */
    1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
    1139, 1137, 1136, /* a=25 */
    1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
    1117, 1116, 1115, /* a=26 */
    1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
    1098, 1096, 1095, /* a=27 */
    1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
    1078, 1077, 1076, /* a=28 */
    1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
    1060, 1059, 1058, /* a=29 */
    1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
    1043, 1042, 1041, /* a=30 */
    1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
    1027, 1026, 1025 /* a=31 */
  };
  static unsigned char T2[384] = {
    7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
    6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
    5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
    4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
    3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
    3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
    3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
    3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
    2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* a=31 */
  };
  mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
  mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */

  /* A should be normalized */
  MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
  /* We should have enough bits in one limb and GMP_NUMB_BITS should be even.
     Since that does not depend on MPFR, we always check this. */
  MPFR_ASSERTN((GMP_NUMB_BITS >= 12) && ((GMP_NUMB_BITS & 1) == 0));
  /* {a, an} and {x, n} should not overlap */
  MPFR_ASSERTD((a + an <= x) || (x + n <= a));
  MPFR_ASSERTD(p >= 11);
  if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
    {
      a += an - n;
      an = n;
    }

  if (p == 11) /* should happen only from recursive calls */
    {
      unsigned long i, ab, ac;
      mp_limb_t t;

      /* take the 12+as most significant bits of A */
      i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
      /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
      ab = i >> 4;
      ac = (ab & 0x3F0) | (i & 0x0F);
      t = (mp_limb_t) T1[ab - 0x80] + (mp_limb_t) T2[ac - 0x80];
      x[0] = t << (GMP_NUMB_BITS - p);
    }
192 else /* p >= 12 */
194 mp_prec_t h, pl;
195 mp_ptr r, s, t, u;
196 mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
197 mp_limb_t neg, cy, cu;
198 MPFR_TMP_DECL(marker);
200 /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
201 h = (p < 18) ? 11 : (p >> 1) + 2;
203 xn = LIMB_SIZE(h); /* limb size of the recursive Xh */
204 rn = LIMB_SIZE(2 * h); /* a priori limb size of Xh^2 */
205 ln = n - xn; /* remaining limbs to be computed */
207 /* Since |Xh - A^{-1/2}| <= 2^{-h}, then by multiplying by Xh + A^{-1/2}
208 we get |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
209 thus the h-3 most significant bits of t should be zero,
210 which is in fact h+1+as-3 because of the normalization of A.
211 This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs. */
212 th = (h + 1 + as - 3) >> MPFR_LOG2_BITS_PER_MP_LIMB;
213 tn = LIMB_SIZE(2 * h + 1 + as);
215 /* we need h+1+as bits of a */
216 ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
217 needed for the recursive call*/
218 if (MPFR_UNLIKELY(ahn > an))
219 ahn = an;
220 mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
221 /* the most h significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
222 limbs, the low (-h) % GMP_NUMB_BITS bits are zero */
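      /* Example: with 64-bit limbs and p = 53, we get h = 28, n = xn = rn
         = 1 and ln = th = 0: the recursive call fills the single limb x[0]
         with a 28-bit approximation, and the correction below extends it
         to 53 bits. */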
      MPFR_TMP_MARK (marker);
      /* first step: square X in r, result is exact */
      un = xn + (tn - th);
      /* We use the same temporary buffer to store r and u: r needs 2*xn
         limbs where u needs xn+(tn-th) limbs. Since tn can store at least
         2h bits, and th at most h bits, then tn-th can store at least h bits,
         thus tn - th >= xn, and reserving the space for u is enough. */
      MPFR_ASSERTD(2 * xn <= un);
      u = r = (mp_ptr) MPFR_TMP_ALLOC (un * sizeof (mp_limb_t));
      if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
                                     thus ln = 0 */
        {
          MPFR_ASSERTD(ln == 0);
          cy = x[0] >> (GMP_NUMB_BITS >> 1);
          r ++;
          r[0] = cy * cy;
        }
      else if (xn == 1) /* xn=1, rn=2 */
        umul_ppmm(r[1], r[0], x[ln], x[ln]);
      else
        {
          mpn_mul_n (r, x + ln, x + ln, xn);
          if (rn < 2 * xn)
            r ++;
        }
      /* now the 2h most significant bits of {r, rn} contain X^2, r has rn
         limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */
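      /* For instance, in the p = 53 / 64-bit example above, 2h = 56 <=
         GMP_NUMB_BITS, so the first branch applies: the high half of x[0]
         (which holds all 28 significant bits of Xh) is squared exactly
         into a single limb. */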
      /* Second step: s <- A * Xh^2 (Xh^2 is in {r, rn}), and truncate the
         low ap bits, i.e., at weight 2^{-2h} (s is aligned to the low
         significant bits) */
      sn = an + rn;
      s = (mp_ptr) MPFR_TMP_ALLOC (sn * sizeof (mp_limb_t));
      if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
                      and 2h >= p+3 */
        {
          /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
             bits from A */
          /* since n=1, and we ensured an <= n, we also have an=1 */
          MPFR_ASSERTD(an == 1);
          umul_ppmm (s[1], s[0], r[0], a[0]);
        }
      else
        {
          /* we have p <= n * GMP_NUMB_BITS
             2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
             thus n <= rn <= n + 1 */
          MPFR_ASSERTD(rn <= n + 1);
          /* since we ensured an <= n, we have an <= rn */
          MPFR_ASSERTD(an <= rn);
          mpn_mul (s, r, rn, a, an);
          /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
             100000... or 011111... if as=0, or
             010000... or 001111... if as=1.
             We ignore the bits of s after the first 2h+1+as ones. */
        }

      /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
         limbs, where rn = LIMBS(2h), an=LIMBS(a), and tn = LIMBS(2h+1+as). */
      t = s + sn - tn; /* pointer to low limb of the high part of t */
      /* the upper h-3 bits of 1-t should be zero,
         where 1 corresponds to the most significant bit of t[tn-1] if as=0,
         and to the 2nd most significant bit of t[tn-1] if as=1 */

      /* compute t <- 1 - t, which is B^tn - {t, tn+1},
         with rounding towards -Inf, i.e., rounding the input t towards +Inf.
         We could only modify the low tn - th limbs from t, but it gives only
         a small speedup, and would make the code more complex. */
      neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
      if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
                       is the part truncated above, thus 1 - t rounded to -Inf
                       is 1 - th - ulp(th) */
        {
          /* since the 1+as most significant bits of t are zero, set them
             to 1 before the one-complement */
          t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
          MPFR_COM_N (t, t, tn);
          /* we should add 1 here to get 1-th complement, and subtract 1 for
             -ulp(th), thus we do nothing */
        }
      else /* negative case: we want 1 - t rounded towards -Inf, i.e.,
              th + eps rounded towards +Inf, which is th + ulp(th):
              we discard the bit corresponding to 1,
              and we add 1 to the least significant bit of t */
        {
          t[tn - 1] ^= neg;
          mpn_add_1 (t, t, tn, 1);
        }
      tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_BITS) of
                   the high limbs of {t, tn} are zero */
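      /* neg was taken from the weight-1 bit of A*Xh^2 before the negation:
         neg != 0 means A*Xh^2 >= 1, i.e., Xh is slightly too large, and the
         correction u computed below must be subtracted from Xh instead of
         added (see the test on neg further down). */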
      /* tn = rn - th, where rn * GMP_NUMB_BITS >= 2*h and
         th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
      MPFR_ASSERTD(tn > 0);

      /* u <- x * t, where {t, tn} contains at least h+3 bits,
         and {x, xn} contains h bits, thus tn >= xn */
      MPFR_ASSERTD(tn >= xn);
      if (tn == 1) /* necessarily xn=1 */
        umul_ppmm (u[1], u[0], t[0], x[ln]);
      else
        mpn_mul (u, t, tn, x + ln, xn);

      /* we have already discarded the upper th high limbs of t, thus we only
         have to consider the upper n - th limbs of u */
      un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
                      h = ceil((p+3)/2) <= (p+4)/2,
                      th*GMP_NUMB_BITS <= h-1 <= p/2+1,
                      thus (n-th)*GMP_NUMB_BITS >= p/2-1. */
      MPFR_ASSERTD(un > 0);
      u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
                              = xn + original_tn - n
                              = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
                              since 2h >= p+3 */
      MPFR_ASSERTD(tn + xn > un); /* allows u[-1] to be accessed below */
      /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
         need to add or subtract.
         In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
         u by 2. */
      if (as == 1)
        /* shift on un+1 limbs to get most significant bit of u[-1] into
           least significant bit of u[0] */
        mpn_lshift (u - 1, u - 1, un + 1, 1);
      pl = n * GMP_NUMB_BITS - p; /* low bits from x */
      /* We want the low pl bits to be zero after rounding to nearest,
         thus we round u to nearest at bit pl-1 of u[0] */
      if (pl > 0)
        {
          cu = mpn_add_1 (u, u, un, u[0] & (MPFR_LIMB_ONE << (pl - 1)));
          /* mask bits 0..pl-1 of u[0] */
          u[0] &= ~MPFR_LIMB_MASK(pl);
        }
      else /* round bit is in u[-1] */
        cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
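      /* For example, with 64-bit limbs and p = 53 we have pl = 11 and take
         the first branch above: bit 10 of u[0] is the round bit; it is
         added into {u, un} and bits 0..10 of u[0] are then cleared. */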
      /* We have already filled {x + ln, xn = n - ln}, and we want to add or
         subtract cu*B^un + {u, un} at position x.
         un = n - th, where th contains <= h+1+as-3 <= h-1 bits
         ln = n - xn, where xn contains >= h bits
         thus un > ln.
         Warning: ln might be zero. */
      MPFR_ASSERTD(un > ln);
      /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
         p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
      MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
      /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
         we need to add or subtract the overlapping part {u + ln, un - ln} */
      if (neg == 0)
        {
          if (ln > 0)
            MPN_COPY (x, u, ln);
          cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
          /* add cu at x+un */
          cy += mpn_add_1 (x + un, x + un, th, cu);
        }
      else /* negative case */
        {
          /* subtract {u+ln, un-ln} from {x+ln,un} */
          cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
          /* carry cy is at x+un, like cu */
          cy = mpn_sub_1 (x + un, x + un, th, cy + cu); /* n - un = th */
          /* cy cannot be non-zero, since the most significant bit of Xh is 1,
             and the correction is bounded by 2^{-h+3} */
          MPFR_ASSERTD(cy == 0);
          if (ln > 0)
            {
              MPFR_COM_N (x, u, ln);
              /* we must add one for the 2-complement ... */
              cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
              /* ... and subtract 1 at x[ln], where n = ln + xn */
              cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
            }
        }
      /* cy can be 1 when A=1, i.e., {a, n} = B^n. In that case we should
         have X = B^n, and setting X to 1-2^{-p} satisfies the error bound
         of 1 ulp. */
      if (MPFR_UNLIKELY(cy != 0))
        {
          cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
          MPFR_ASSERTD(cy == 0);
        }

      MPFR_TMP_FREE (marker);
    }
}
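/* A minimal, hypothetical usage sketch of mpfr_mpn_rec_sqrt (kept under
   #if 0 since it is only an illustration, not part of the library):
   compute a 53-bit approximation of 1/sqrt(2).  With as = 1 and
   {a, 1} = B/2 we have A = 2^(1+as) * 1/2 = 2, so the output
   X = {x, n}/B^n approximates 2^{-1/2} = 0.7071..., with 1/2 <= X < 1
   and an error of at most 2^{-53}. */
#if 0
static void
example_rec_sqrt_two (void)
{
  mp_limb_t a[1];
  mp_limb_t x[LIMB_SIZE(53)];

  a[0] = MPFR_LIMB_HIGHBIT;                       /* {a, 1}/B = 1/2 */
  mpfr_mpn_rec_sqrt (x, 53, a, GMP_NUMB_BITS, 1); /* p = 53, as = 1 */
  /* the most significant bit of x[LIMB_SIZE(53) - 1] is now set, and the
     low bits beyond the first 53 are zero */
}
#endif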
int
mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mp_rnd_t rnd_mode)
{
  mp_prec_t rp, up, wp;
  mp_size_t rn, wn;
  int s, cy, inex;
  mp_ptr x;
  int out_of_place;
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC (("x[%#R]=%R rnd=%d", u, u, rnd_mode),
                 ("y[%#R]=%R inexact=%d", r, r, inex));
  /* special values */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
        {
          /* 0+ or 0- */
          MPFR_SET_INF(r);
          MPFR_SET_POS(r);
          MPFR_RET(0); /* Inf is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* 1/sqrt(-Inf) = NAN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          /* 1/sqrt(+Inf) = +0 */
          MPFR_SET_POS(r);
          MPFR_SET_ZERO(r);
          MPFR_RET(0);
        }
    }

  /* if u < 0, 1/sqrt(u) is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }
  MPFR_CLEAR_FLAGS(r);
  MPFR_SET_POS(r);

  rp = MPFR_PREC(r); /* output precision */
  up = MPFR_PREC(u); /* input precision */
  wp = rp + 11;      /* initial working precision */

  /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
     If e is even, we compute an approximation X of (4U)^{-1/2},
     and the result is X*2^(-(e-2)/2) [case s=1].
     If e is odd, we compute an approximation X of (2U)^{-1/2},
     and the result is X*2^(-(e-1)/2) [case s=0]. */
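  /* For instance, u = 6 = 0.75 * 2^3 has an odd exponent (s = 0 below):
     X approximates (2*0.75)^{-1/2} = 0.8164..., and the result is
     X * 2^{-(3-1)/2} = X/2 = 0.4082... = 1/sqrt(6). */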
  /* parity of the exponent of u */
  s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);

  rn = LIMB_SIZE(rp);

  /* for the first iteration, if rp + 11 fits into rn limbs, we round wp up
     to a full limb to maximize the chance of rounding, while avoiding the
     allocation of extra space */
  wp = rp + 11;
  if (wp < rn * BITS_PER_MP_LIMB)
    wp = rn * BITS_PER_MP_LIMB;
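  /* e.g., with 64-bit limbs and rp = 40: rn = 1, and wp = 51 is raised to
     a full limb, i.e., wp = 64 */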
  for (;;)
    {
      MPFR_TMP_MARK (marker);
      wn = LIMB_SIZE(wp);
      out_of_place = (r == u) || (wn > rn);
      if (out_of_place)
        x = (mp_ptr) MPFR_TMP_ALLOC (wn * sizeof (mp_limb_t));
      else
        x = MPFR_MANT(r);
      mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
      /* If the input was not truncated, the error is at most one ulp;
         if the input was truncated, the error is at most two ulps
         (see algorithms.tex). */
      if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
                                     rp + (rnd_mode == GMP_RNDN))))
        break;
      /* We detect only now the exact case where u=2^(2e), to avoid
         slowing down the average case. This can happen only when the
         mantissa is exactly 1/2 and the exponent is odd. */
      if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
        {
          mp_prec_t pl = wn * BITS_PER_MP_LIMB - wp;

          /* we should have x=111...111 */
          mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
          x[wn - 1] = MPFR_LIMB_HIGHBIT;
          s += 2;
          break; /* go through */
        }
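      /* For instance, u = 4 (mantissa 1/2, EXP(u) = 3, s = 0): the
         approximation x is 0.111...1, which the two lines above turn into
         1/2 while s becomes 2, so that below MPFR_EXP(r) = -(3-1-2)/2 = 0
         and r = 1/2 = 1/sqrt(4) exactly. */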
      MPFR_TMP_FREE(marker);

      wp += BITS_PER_MP_LIMB;
    }
  cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
  MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
  if (MPFR_UNLIKELY(cy != 0))
    {
      MPFR_EXP(r) ++;
      MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
    }
  MPFR_TMP_FREE(marker);
  return mpfr_check_range (r, inex, rnd_mode);
}