2007-09-27 H.J. Lu <hongjiu.lu@intel.com>
[official-gcc.git] / libgcc / config / libbid / bid64_sqrt.c
blob1098ede90df7cefdba5fdaff1cb53f9e89cfd620
1 /* Copyright (C) 2007 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 2, or (at your option) any later
8 version.
10 In addition to the permissions in the GNU General Public License, the
11 Free Software Foundation gives you unlimited permission to link the
12 compiled version of this file into combinations with other programs,
13 and to distribute those combinations without any restriction coming
14 from the use of this file. (The General Public License restrictions
15 do apply in other respects; for example, they cover modification of
16 the file, and distribution when not linked into a combine
17 executable.)
19 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
20 WARRANTY; without even the implied warranty of MERCHANTABILITY or
21 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
22 for more details.
24 You should have received a copy of the GNU General Public License
25 along with GCC; see the file COPYING. If not, write to the Free
26 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
27 02110-1301, USA. */
29 /*****************************************************************************
30 * BID64 square root
31 *****************************************************************************
33 * Algorithm description:
35 * if(exponent_x is odd)
36 * scale coefficient_x by 10, adjust exponent
37 * - get lower estimate for number of digits in coefficient_x
38 * - scale coefficient x to between 31 and 33 decimal digits
39 * - in parallel, check for exact case and return if true
40 * - get high part of result coefficient using double precision sqrt
41 * - compute remainder and refine coefficient in one iteration (which
42 * modifies it by at most 1)
43 * - result exponent is easy to compute from the adjusted arg. exponent
45 ****************************************************************************/
47 #include "bid_internal.h"
48 #include "bid_sqrt_macros.h"
49 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
50 #include <fenv.h>
52 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
53 #endif
55 extern double sqrt (double);
57 #if DECIMAL_CALL_BY_REFERENCE
59 void
60 bid64_sqrt (UINT64 * pres,
61 UINT64 *
62 px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
63 _EXC_INFO_PARAM) {
64 UINT64 x;
65 #else
67 UINT64
68 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
69 _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
70 #endif
71 UINT128 CA, CT;
72 UINT64 sign_x, coefficient_x;
73 UINT64 Q, Q2, A10, C4, R, R2, QE, res;
74 SINT64 D;
75 int_double t_scale;
76 int_float tempx;
77 double da, dq, da_h, da_l, dqe;
78 int exponent_x, exponent_q, bin_expon_cx;
79 int digits_x;
80 int scale;
81 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
82 fexcept_t binaryflags = 0;
83 #endif
85 #if DECIMAL_CALL_BY_REFERENCE
86 #if !DECIMAL_GLOBAL_ROUNDING
87 _IDEC_round rnd_mode = *prnd_mode;
88 #endif
89 x = *px;
90 #endif
92 // unpack arguments, check for NaN or Infinity
93 if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
94 // x is Inf. or NaN or 0
95 if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
96 res = coefficient_x;
97 if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64) // -Infinity
99 res = NAN_MASK64;
100 #ifdef SET_STATUS_FLAGS
101 __set_status_flags (pfpsf, INVALID_EXCEPTION);
102 #endif
104 #ifdef SET_STATUS_FLAGS
105 if ((x & SNAN_MASK64) == SNAN_MASK64) // sNaN
106 __set_status_flags (pfpsf, INVALID_EXCEPTION);
107 #endif
108 BID_RETURN (res & QUIET_MASK64);
110 // x is 0
111 exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
112 res = sign_x | (((UINT64) exponent_x) << 53);
113 BID_RETURN (res);
115 // x<0?
116 if (sign_x && coefficient_x) {
117 res = NAN_MASK64;
118 #ifdef SET_STATUS_FLAGS
119 __set_status_flags (pfpsf, INVALID_EXCEPTION);
120 #endif
121 BID_RETURN (res);
123 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
124 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
125 #endif
126 //--- get number of bits in the coefficient of x ---
127 tempx.d = (float) coefficient_x;
128 bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
129 digits_x = estimate_decimal_digits[bin_expon_cx];
130 // add test for range
131 if (coefficient_x >= power10_index_binexp[bin_expon_cx])
132 digits_x++;
134 A10 = coefficient_x;
135 if (exponent_x & 1) {
136 A10 = (A10 << 2) + A10;
137 A10 += A10;
140 dqe = sqrt ((double) A10);
141 QE = (UINT32) dqe;
142 if (QE * QE == A10) {
143 res =
144 very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
145 QE);
146 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
147 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
148 #endif
149 BID_RETURN (res);
151 // if exponent is odd, scale coefficient by 10
152 scale = 31 - digits_x;
153 exponent_q = exponent_x - scale;
154 scale += (exponent_q & 1); // exp. bias is even
156 CT = power10_table_128[scale];
157 __mul_64x128_short (CA, coefficient_x, CT);
159 // 2^64
160 t_scale.i = 0x43f0000000000000ull;
161 // convert CA to DP
162 da_h = CA.w[1];
163 da_l = CA.w[0];
164 da = da_h * t_scale.d + da_l;
166 dq = sqrt (da);
168 Q = (UINT64) dq;
170 // get sign(sqrt(CA)-Q)
171 R = CA.w[0] - Q * Q;
172 R = ((SINT64) R) >> 63;
173 D = R + R + 1;
175 exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
177 #ifdef SET_STATUS_FLAGS
178 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
179 #endif
181 #ifndef IEEE_ROUND_NEAREST
182 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
183 if (!((rnd_mode) & 3)) {
184 #endif
185 #endif
187 // midpoint to check
188 Q2 = Q + Q + D;
189 C4 = CA.w[0] << 2;
191 // get sign(-sqrt(CA)+Midpoint)
192 R2 = Q2 * Q2 - C4;
193 R2 = ((SINT64) R2) >> 63;
195 // adjust Q if R!=R2
196 Q += (D & (R ^ R2));
197 #ifndef IEEE_ROUND_NEAREST
198 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
199 } else {
200 C4 = CA.w[0];
201 Q += D;
202 if ((SINT64) (Q * Q - C4) > 0)
203 Q--;
204 if (rnd_mode == ROUNDING_UP)
205 Q++;
207 #endif
208 #endif
210 res = fast_get_BID64 (0, exponent_q, Q);
211 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
212 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
213 #endif
214 BID_RETURN (res);
218 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
220 UINT256 M256, C4, C8;
221 UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
222 mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
223 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
224 SINT64 D;
225 int_float fx, f64;
226 int exponent_x, bin_expon_cx, done = 0;
227 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
228 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
229 fexcept_t binaryflags = 0;
230 #endif
232 // unpack arguments, check for NaN or Infinity
233 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
234 res = CX.w[1];
235 // NaN ?
236 if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
237 #ifdef SET_STATUS_FLAGS
238 if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull) // sNaN
239 __set_status_flags (pfpsf, INVALID_EXCEPTION);
240 #endif
241 Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
242 Tmp.w[0] = CX.w[0];
243 TP128 = reciprocals10_128[18];
244 __mul_128x128_full (Qh, Ql, Tmp, TP128);
245 amount = recip_scale[18];
246 __shr_128 (Tmp, Qh, amount);
247 res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
248 BID_RETURN (res);
250 // x is Infinity?
251 if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
252 if (sign_x) {
253 // -Inf, return NaN
254 res = 0x7c00000000000000ull;
255 #ifdef SET_STATUS_FLAGS
256 __set_status_flags (pfpsf, INVALID_EXCEPTION);
257 #endif
259 BID_RETURN (res);
261 // x is 0 otherwise
263 exponent_x =
264 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
265 DECIMAL_EXPONENT_BIAS;
266 if (exponent_x < 0)
267 exponent_x = 0;
268 if (exponent_x > DECIMAL_MAX_EXPON_64)
269 exponent_x = DECIMAL_MAX_EXPON_64;
270 //res= sign_x | (((UINT64)exponent_x)<<53);
271 res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
272 BID_RETURN (res);
274 if (sign_x) {
275 res = 0x7c00000000000000ull;
276 #ifdef SET_STATUS_FLAGS
277 __set_status_flags (pfpsf, INVALID_EXCEPTION);
278 #endif
279 BID_RETURN (res);
281 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
282 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
283 #endif
285 // 2^64
286 f64.i = 0x5f800000;
288 // fx ~ CX
289 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
290 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
291 digits = estimate_decimal_digits[bin_expon_cx];
293 A10 = CX;
294 if (exponent_x & 1) {
295 A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
296 A10.w[0] = CX.w[0] << 3;
297 CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
298 CX2.w[0] = CX.w[0] << 1;
299 __add_128_128 (A10, A10, CX2);
302 C256.w[1] = A10.w[1];
303 C256.w[0] = A10.w[0];
304 CS.w[0] = short_sqrt128 (A10);
305 CS.w[1] = 0;
306 mul_factor = 0;
307 // check for exact result
308 if (CS.w[0] < 10000000000000000ull) {
309 if (CS.w[0] * CS.w[0] == A10.w[0]) {
310 __sqr64_fast (S2, CS.w[0]);
311 if (S2.w[1] == A10.w[1]) // && S2.w[0]==A10.w[0])
313 res =
314 get_BID64 (0,
315 ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
316 DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
317 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
318 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
319 #endif
320 BID_RETURN (res);
323 if (CS.w[0] >= 1000000000000000ull) {
324 done = 1;
325 exponent_q = exponent_x;
326 C256.w[1] = A10.w[1];
327 C256.w[0] = A10.w[0];
329 #ifdef SET_STATUS_FLAGS
330 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
331 #endif
332 exact = 0;
333 } else {
334 B10 = 0x3333333333333334ull;
335 __mul_64x64_to_128_full (CS2, CS.w[0], B10);
336 CS0 = CS2.w[1] >> 1;
337 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
338 #ifdef SET_STATUS_FLAGS
339 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
340 #endif
341 exact = 0;
343 done = 1;
344 CS.w[0] = CS0;
345 exponent_q = exponent_x + 2;
346 mul_factor = 10;
347 mul_factor2 = 100;
348 if (CS.w[0] >= 10000000000000000ull) {
349 __mul_64x64_to_128_full (CS2, CS.w[0], B10);
350 CS0 = CS2.w[1] >> 1;
351 if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
352 #ifdef SET_STATUS_FLAGS
353 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
354 #endif
355 exact = 0;
357 exponent_q += 2;
358 CS.w[0] = CS0;
359 mul_factor = 100;
360 mul_factor2 = 10000;
362 if (exact) {
363 CS0 = CS.w[0] * mul_factor;
364 __sqr64_fast (CS1, CS0)
365 if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
366 #ifdef SET_STATUS_FLAGS
367 __set_status_flags (pfpsf, INEXACT_EXCEPTION);
368 #endif
369 exact = 0;
374 if (!done) {
375 // get number of digits in CX
376 D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
377 if (D > 0
378 || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
379 digits++;
381 // if exponent is odd, scale coefficient by 10
382 scale = 31 - digits;
383 exponent_q = exponent_x - scale;
384 scale += (exponent_q & 1); // exp. bias is even
386 T128 = power10_table_128[scale];
387 __mul_128x128_low (C256, CX, T128);
390 CS.w[0] = short_sqrt128 (C256);
392 //printf("CS=%016I64x\n",CS.w[0]);
394 exponent_q =
395 ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
396 DECIMAL_EXPONENT_BIAS;
397 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
398 extra_digits = -exponent_q;
399 exponent_q = 0;
401 // get coeff*(2^M[extra_digits])/10^extra_digits
402 __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
404 // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
405 amount = short_recip_scale[extra_digits];
407 CS0 = QH.w[1] >> amount;
409 #ifdef SET_STATUS_FLAGS
410 if (exact) {
411 if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
412 exact = 0;
414 if (!exact)
415 __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
416 #endif
418 CS.w[0] = CS0;
419 if (!mul_factor)
420 mul_factor = 1;
421 mul_factor *= power10_table_128[extra_digits].w[0];
422 __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
423 if (mul_factor2_long.w[1])
424 mul_factor2 = 0;
425 else
426 mul_factor2 = mul_factor2_long.w[1];
428 // 4*C256
429 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
430 C4.w[0] = C256.w[0] << 2;
432 #ifndef IEEE_ROUND_NEAREST
433 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
434 if (!((rnd_mode) & 3)) {
435 #endif
436 #endif
437 // compare to midpoints
438 CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
439 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
440 if (mul_factor)
441 CSM.w[0] *= mul_factor;
442 // CSM^2
443 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
444 //__mul_128x128_to_256(M256, CSM, CSM);
446 if (C4.w[1] > M256.w[1] ||
447 (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
448 // round up
449 CS.w[0]++;
450 } else {
451 C8.w[0] = CS.w[0] << 3;
452 C8.w[1] = 0;
453 if (mul_factor) {
454 if (mul_factor2) {
455 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
456 } else {
457 __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
460 // M256 - 8*CSM
461 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
462 M256.w[1] = M256.w[1] - C8.w[1] - Carry;
464 // if CSM' > C256, round up
465 if (M256.w[1] > C4.w[1] ||
466 (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
467 // round down
468 if (CS.w[0])
469 CS.w[0]--;
472 #ifndef IEEE_ROUND_NEAREST
473 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
474 } else {
475 CS.w[0]++;
476 CSM.w[0] = CS.w[0];
477 C8.w[0] = CSM.w[0] << 1;
478 if (mul_factor)
479 CSM.w[0] *= mul_factor;
480 __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
481 C8.w[1] = 0;
482 if (mul_factor) {
483 if (mul_factor2) {
484 __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
485 } else {
486 __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
489 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
491 if (M256.w[1] > C256.w[1] ||
492 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
493 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
494 M256.w[1] = M256.w[1] - Carry - C8.w[1];
495 M256.w[0]++;
496 if (!M256.w[0]) {
497 M256.w[1]++;
501 if ((M256.w[1] > C256.w[1] ||
502 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
503 && (CS.w[0] > 1)) {
505 CS.w[0]--;
507 if (CS.w[0] > 1) {
508 __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
509 M256.w[1] = M256.w[1] - Carry - C8.w[1];
510 M256.w[0]++;
511 if (!M256.w[0]) {
512 M256.w[1]++;
515 if (M256.w[1] > C256.w[1] ||
516 (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
517 CS.w[0]--;
522 else {
523 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
524 M256.w[1] = M256.w[1] + Carry + C8.w[1];
525 M256.w[0]++;
526 if(!M256.w[0])
528 M256.w[1]++;
530 CS.w[0]++;
531 if(M256.w[1]<C256.w[1] ||
532 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
534 CS.w[0]++;
536 CS.w[0]++;
538 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
539 // RU?
540 if (((rnd_mode) != ROUNDING_UP) || exact) {
541 if (CS.w[0])
542 CS.w[0]--;
546 #endif
547 #endif
548 //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
550 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
551 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
552 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
553 #endif
554 BID_RETURN (res);