1 /* Copyright (C) 2007-2016 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify it under
6 the terms of the GNU General Public License as published by the Free
7 Software Foundation; either version 3, or (at your option) any later
10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
24 #ifndef _SQRT_MACROS_H_
25 #define _SQRT_MACROS_H_
29 #if DOUBLE_EXTENDED_ON
31 extern BINARY80
SQRT80 (BINARY80
);
35 short_sqrt128 (UINT128 A10
) {
41 l64
= (BINARY80
) f64
.d
;
42 lx
= (BINARY80
) A10
.w
[1] * l64
+ (BINARY80
) A10
.w
[0];
49 long_sqrt128 (UINT128
* pCS
, UINT256 C256
) {
54 BINARY80 l64
, lm64
, l128
, lxL
, lx
, ly
, lS
, lSH
, lSL
, lE
, l3
, l2
,
56 int_float fx
, f64
, fm64
;
57 int *ple
= (int *) &lx
;
61 l64
= (BINARY80
) f64
.d
;
64 lx
= l3
= (BINARY80
) C256
.w
[3] * l64
* l128
;
65 l2
= (BINARY80
) C256
.w
[2] * l128
;
67 l1
= (BINARY80
) C256
.w
[1] * l64
;
69 l0
= (BINARY80
) C256
.w
[0];
77 lm64
= (BINARY80
) fm64
.d
;
78 CS
.w
[1] = (UINT64
) (lS
* lm64
);
79 CS
.w
[0] = (UINT64
) (lS
- (BINARY80
) CS
.w
[1] * l64
);
81 ///////////////////////////////////////
83 // little endian code only
84 // add solution for big endian
85 //////////////////////////////////////
87 *((UINT64
*) & lSH
) &= 0xffffffff00000000ull
;
89 // correction for C256 rounding
90 lCl
= FENCE (l3
- lx
);
91 lCl
= FENCE (lCl
+ l2
);
92 lCl
= FENCE (lCl
+ l1
);
93 lCl
= FENCE (lCl
+ l0
);
97 //////////////////////////////////////////
98 // Watch for compiler re-ordering
100 /////////////////////////////////////////
102 lxL
= FENCE (lx
- lSH
* lSH
);
105 lxL
= FENCE (lxL
- lp
);
107 lxL
= FENCE (lxL
- lSL
);
111 lE
= lCl
/ (lS
+ lS
);
113 // get low part of coefficient
133 extern double sqrt (double);
135 __BID_INLINE__ UINT64
136 short_sqrt128 (UINT128 A10
) {
137 UINT256 ARS
, ARS0
, AE0
, AE
, S
;
145 f64
.i
= 0x43f0000000000000ull
;
147 lx
= (double) A10
.w
[1] * l64
+ (double) A10
.w
[0];
148 ly
.d
= 1.0 / sqrt (lx
);
150 MY
= (ly
.i
& 0x000fffffffffffffull
) | 0x0010000000000000ull
;
151 ey
= 0x3ff - (ly
.i
>> 52);
154 __mul_64x128_to_192 (ARS0
, MY
, A10
);
155 __mul_64x192_to_256 (ARS
, MY
, ARS0
);
157 // shr by 2*ey+40, to get a 64-bit value
158 k
= (ey
<< 1) + 104 - 64;
161 ES
= (ARS
.w
[2] >> (k
- 128)) | (ARS
.w
[3] << (192 - k
));
171 __shr_128 (ARS
, ARS
, k
);
176 ES
= ((SINT64
) ES
) >> 1;
178 if (((SINT64
) ES
) < 0) {
181 // A*RS*eps (scaled by 2^64)
182 __mul_64x192_to_256 (AE0
, ES
, ARS0
);
188 __add_carry_out (S
.w
[0], CY
, ARS0
.w
[0], AE
.w
[0]);
189 __add_carry_in_out (S
.w
[1], CY
, ARS0
.w
[1], AE
.w
[1], CY
);
190 S
.w
[2] = ARS0
.w
[2] + AE
.w
[2] + CY
;
192 // A*RS*eps (scaled by 2^64)
193 __mul_64x192_to_256 (AE0
, ES
, ARS0
);
199 __sub_borrow_out (S
.w
[0], CY
, ARS0
.w
[0], AE
.w
[0]);
200 __sub_borrow_in_out (S
.w
[1], CY
, ARS0
.w
[1], AE
.w
[1], CY
);
201 S
.w
[2] = ARS0
.w
[2] - AE
.w
[2] - CY
;
222 return (UINT64
) ((S
.w
[0] + 1) >> 1);
229 long_sqrt128 (UINT128
* pCS
, UINT256 C256
) {
231 UINT256 ARS00
, AE
, AE2
, S
;
232 UINT128 ES
, ES2
, ARS1
;
234 double l64
, l128
, lx
, l2
, l1
, l0
;
239 f64
.i
= 0x43f0000000000000ull
;
243 lx
= (double) C256
.w
[3] * l64
* l128
;
244 l2
= (double) C256
.w
[2] * l128
;
245 lx
= FENCE (lx
+ l2
);
246 l1
= (double) C256
.w
[1] * l64
;
247 lx
= FENCE (lx
+ l1
);
248 l0
= (double) C256
.w
[0];
249 lx
= FENCE (lx
+ l0
);
251 ly
.d
= 1.0 / sqrt (lx
);
253 MY
= (ly
.i
& 0x000fffffffffffffull
) | 0x0010000000000000ull
;
254 ey
= 0x3ff - (ly
.i
>> 52);
256 // A10*RS^2, scaled by 2^(2*ey+104)
257 __mul_64x256_to_320 (ARS0
, MY
, C256
);
258 __mul_64x320_to_384 (ARS
, MY
, ARS0
);
260 // shr by k=(2*ey+104)-128
261 // expect k is in the range (192, 256) if result in [10^33, 10^34)
262 // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
263 k
= (ey
<< 1) + 104 - 128 - 192;
265 ES
.w
[0] = (ARS
.w
[3] >> (k
+ 1)) | (ARS
.w
[4] << (k2
- 1));
266 ES
.w
[1] = (ARS
.w
[4] >> k
) | (ARS
.w
[5] << k2
);
267 ES
.w
[1] = ((SINT64
) ES
.w
[1]) >> 1;
269 // A*RS >> 192 (for error term computation)
270 ARS1
.w
[0] = ARS0
.w
[3];
271 ARS1
.w
[1] = ARS0
.w
[4];
274 ARS00
.w
[0] = ARS0
.w
[1];
275 ARS00
.w
[1] = ARS0
.w
[2];
276 ARS00
.w
[2] = ARS0
.w
[3];
277 ARS00
.w
[3] = ARS0
.w
[4];
279 if (((SINT64
) ES
.w
[1]) < 0) {
286 __mul_128x128_to_256 (AE
, ES
, ARS1
);
288 __add_carry_out (S
.w
[0], CY
, ARS00
.w
[0], AE
.w
[0]);
289 __add_carry_in_out (S
.w
[1], CY
, ARS00
.w
[1], AE
.w
[1], CY
);
290 __add_carry_in_out (S
.w
[2], CY
, ARS00
.w
[2], AE
.w
[2], CY
);
291 S
.w
[3] = ARS00
.w
[3] + AE
.w
[3] + CY
;
294 __mul_128x128_to_256 (AE
, ES
, ARS1
);
296 __sub_borrow_out (S
.w
[0], CY
, ARS00
.w
[0], AE
.w
[0]);
297 __sub_borrow_in_out (S
.w
[1], CY
, ARS00
.w
[1], AE
.w
[1], CY
);
298 __sub_borrow_in_out (S
.w
[2], CY
, ARS00
.w
[2], AE
.w
[2], CY
);
299 S
.w
[3] = ARS00
.w
[3] - AE
.w
[3] - CY
;
302 // 3/2*eps^2, scaled by 2^128
303 ES32
= ES
.w
[1] + (ES
.w
[1] >> 1);
304 __mul_64x64_to_128 (ES2
, ES32
, ES
.w
[1]);
306 __mul_128x128_to_256 (AE2
, ES2
, ARS1
);
308 // result, scaled by 2^(ey+52-64)
309 __add_carry_out (S
.w
[0], CY
, S
.w
[0], AE2
.w
[0]);
310 __add_carry_in_out (S
.w
[1], CY
, S
.w
[1], AE2
.w
[1], CY
);
311 __add_carry_in_out (S
.w
[2], CY
, S
.w
[2], AE2
.w
[2], CY
);
312 S
.w
[3] = S
.w
[3] + AE2
.w
[3] + CY
;
317 S
.w
[0] = (S
.w
[1] >> k
) | (S
.w
[2] << k2
);
318 S
.w
[1] = (S
.w
[2] >> k
) | (S
.w
[3] << k2
);
325 pCS
->w
[0] = (S
.w
[1] << 63) | (S
.w
[0] >> 1);
326 pCS
->w
[1] = S
.w
[1] >> 1;