softfloat: Expand out the STATUS_PARAM macro
[qemu.git] / fpu / softfloat.c
blob6437b177496ab1ea4dd22efa31871c775a02563f
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "config.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
90 #include <stdlib.h>
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "softfloat-macros.h"
99 /*----------------------------------------------------------------------------
100 | Functions and definitions to determine: (1) whether tininess for underflow
101 | is detected before or after rounding by default, (2) what (if anything)
102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 | are propagated from function inputs to output. These details are target-
105 | specific.
106 *----------------------------------------------------------------------------*/
107 #include "softfloat-specialize.h"
109 /*----------------------------------------------------------------------------
110 | Returns the fraction bits of the half-precision floating-point value `a'.
111 *----------------------------------------------------------------------------*/
113 static inline uint32_t extractFloat16Frac(float16 a)
115 return float16_val(a) & 0x3ff;
118 /*----------------------------------------------------------------------------
119 | Returns the exponent bits of the half-precision floating-point value `a'.
120 *----------------------------------------------------------------------------*/
122 static inline int_fast16_t extractFloat16Exp(float16 a)
124 return (float16_val(a) >> 10) & 0x1f;
127 /*----------------------------------------------------------------------------
128 | Returns the sign bit of the single-precision floating-point value `a'.
129 *----------------------------------------------------------------------------*/
131 static inline flag extractFloat16Sign(float16 a)
133 return float16_val(a)>>15;
136 /*----------------------------------------------------------------------------
137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
139 | input. If `zSign' is 1, the input is negated before being converted to an
140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141 | is simply rounded to an integer, with the inexact exception raised if the
142 | input cannot be represented exactly as an integer. However, if the fixed-
143 | point input is too large, the invalid exception is raised and the largest
144 | positive or negative integer is returned.
145 *----------------------------------------------------------------------------*/
147 static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
149 int8 roundingMode;
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
152 int32_t z;
154 roundingMode = STATUS(float_rounding_mode);
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
156 switch (roundingMode) {
157 case float_round_nearest_even:
158 case float_round_ties_away:
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
179 float_raise( float_flag_invalid STATUS_VAR);
180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
183 return z;
187 /*----------------------------------------------------------------------------
188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
190 | and returns the properly rounded 64-bit integer corresponding to the input.
191 | If `zSign' is 1, the input is negated before being converted to an integer.
192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
193 | the inexact exception raised if the input cannot be represented exactly as
194 | an integer. However, if the fixed-point input is too large, the invalid
195 | exception is raised and the largest positive or negative integer is
196 | returned.
197 *----------------------------------------------------------------------------*/
199 static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
200 float_status *status)
202 int8 roundingMode;
203 flag roundNearestEven, increment;
204 int64_t z;
206 roundingMode = STATUS(float_rounding_mode);
207 roundNearestEven = ( roundingMode == float_round_nearest_even );
208 switch (roundingMode) {
209 case float_round_nearest_even:
210 case float_round_ties_away:
211 increment = ((int64_t) absZ1 < 0);
212 break;
213 case float_round_to_zero:
214 increment = 0;
215 break;
216 case float_round_up:
217 increment = !zSign && absZ1;
218 break;
219 case float_round_down:
220 increment = zSign && absZ1;
221 break;
222 default:
223 abort();
225 if ( increment ) {
226 ++absZ0;
227 if ( absZ0 == 0 ) goto overflow;
228 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230 z = absZ0;
231 if ( zSign ) z = - z;
232 if ( z && ( ( z < 0 ) ^ zSign ) ) {
233 overflow:
234 float_raise( float_flag_invalid STATUS_VAR);
235 return
236 zSign ? (int64_t) LIT64( 0x8000000000000000 )
237 : LIT64( 0x7FFFFFFFFFFFFFFF );
239 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
240 return z;
244 /*----------------------------------------------------------------------------
245 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
246 | `absZ1', with binary point between bits 63 and 64 (between the input words),
247 | and returns the properly rounded 64-bit unsigned integer corresponding to the
248 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
249 | with the inexact exception raised if the input cannot be represented exactly
250 | as an integer. However, if the fixed-point input is too large, the invalid
251 | exception is raised and the largest unsigned integer is returned.
252 *----------------------------------------------------------------------------*/
254 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
255 uint64_t absZ1, float_status *status)
257 int8 roundingMode;
258 flag roundNearestEven, increment;
260 roundingMode = STATUS(float_rounding_mode);
261 roundNearestEven = (roundingMode == float_round_nearest_even);
262 switch (roundingMode) {
263 case float_round_nearest_even:
264 case float_round_ties_away:
265 increment = ((int64_t)absZ1 < 0);
266 break;
267 case float_round_to_zero:
268 increment = 0;
269 break;
270 case float_round_up:
271 increment = !zSign && absZ1;
272 break;
273 case float_round_down:
274 increment = zSign && absZ1;
275 break;
276 default:
277 abort();
279 if (increment) {
280 ++absZ0;
281 if (absZ0 == 0) {
282 float_raise(float_flag_invalid STATUS_VAR);
283 return LIT64(0xFFFFFFFFFFFFFFFF);
285 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
288 if (zSign && absZ0) {
289 float_raise(float_flag_invalid STATUS_VAR);
290 return 0;
293 if (absZ1) {
294 STATUS(float_exception_flags) |= float_flag_inexact;
296 return absZ0;
299 /*----------------------------------------------------------------------------
300 | Returns the fraction bits of the single-precision floating-point value `a'.
301 *----------------------------------------------------------------------------*/
303 static inline uint32_t extractFloat32Frac( float32 a )
306 return float32_val(a) & 0x007FFFFF;
310 /*----------------------------------------------------------------------------
311 | Returns the exponent bits of the single-precision floating-point value `a'.
312 *----------------------------------------------------------------------------*/
314 static inline int_fast16_t extractFloat32Exp(float32 a)
317 return ( float32_val(a)>>23 ) & 0xFF;
321 /*----------------------------------------------------------------------------
322 | Returns the sign bit of the single-precision floating-point value `a'.
323 *----------------------------------------------------------------------------*/
325 static inline flag extractFloat32Sign( float32 a )
328 return float32_val(a)>>31;
332 /*----------------------------------------------------------------------------
333 | If `a' is denormal and we are in flush-to-zero mode then set the
334 | input-denormal exception and return zero. Otherwise just return the value.
335 *----------------------------------------------------------------------------*/
336 float32 float32_squash_input_denormal(float32 a, float_status *status)
338 if (STATUS(flush_inputs_to_zero)) {
339 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
340 float_raise(float_flag_input_denormal STATUS_VAR);
341 return make_float32(float32_val(a) & 0x80000000);
344 return a;
347 /*----------------------------------------------------------------------------
348 | Normalizes the subnormal single-precision floating-point value represented
349 | by the denormalized significand `aSig'. The normalized exponent and
350 | significand are stored at the locations pointed to by `zExpPtr' and
351 | `zSigPtr', respectively.
352 *----------------------------------------------------------------------------*/
354 static void
355 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
357 int8 shiftCount;
359 shiftCount = countLeadingZeros32( aSig ) - 8;
360 *zSigPtr = aSig<<shiftCount;
361 *zExpPtr = 1 - shiftCount;
365 /*----------------------------------------------------------------------------
366 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
367 | single-precision floating-point value, returning the result. After being
368 | shifted into the proper positions, the three fields are simply added
369 | together to form the result. This means that any integer portion of `zSig'
370 | will be added into the exponent. Since a properly normalized significand
371 | will have an integer portion equal to 1, the `zExp' input should be 1 less
372 | than the desired result exponent whenever `zSig' is a complete, normalized
373 | significand.
374 *----------------------------------------------------------------------------*/
376 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
379 return make_float32(
380 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 /*----------------------------------------------------------------------------
385 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
386 | and significand `zSig', and returns the proper single-precision floating-
387 | point value corresponding to the abstract input. Ordinarily, the abstract
388 | value is simply rounded and packed into the single-precision format, with
389 | the inexact exception raised if the abstract input cannot be represented
390 | exactly. However, if the abstract value is too large, the overflow and
391 | inexact exceptions are raised and an infinity or maximal finite value is
392 | returned. If the abstract value is too small, the input value is rounded to
393 | a subnormal number, and the underflow and inexact exceptions are raised if
394 | the abstract input cannot be represented exactly as a subnormal single-
395 | precision floating-point number.
396 | The input significand `zSig' has its binary point between bits 30
397 | and 29, which is 7 bits to the left of the usual location. This shifted
398 | significand must be normalized or smaller. If `zSig' is not normalized,
399 | `zExp' must be 0; in that case, the result returned is a subnormal number,
400 | and it must not require rounding. In the usual case that `zSig' is
401 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
402 | The handling of underflow and overflow follows the IEC/IEEE Standard for
403 | Binary Floating-Point Arithmetic.
404 *----------------------------------------------------------------------------*/
406 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
407 float_status *status)
409 int8 roundingMode;
410 flag roundNearestEven;
411 int8 roundIncrement, roundBits;
412 flag isTiny;
414 roundingMode = STATUS(float_rounding_mode);
415 roundNearestEven = ( roundingMode == float_round_nearest_even );
416 switch (roundingMode) {
417 case float_round_nearest_even:
418 case float_round_ties_away:
419 roundIncrement = 0x40;
420 break;
421 case float_round_to_zero:
422 roundIncrement = 0;
423 break;
424 case float_round_up:
425 roundIncrement = zSign ? 0 : 0x7f;
426 break;
427 case float_round_down:
428 roundIncrement = zSign ? 0x7f : 0;
429 break;
430 default:
431 abort();
432 break;
434 roundBits = zSig & 0x7F;
435 if ( 0xFD <= (uint16_t) zExp ) {
436 if ( ( 0xFD < zExp )
437 || ( ( zExp == 0xFD )
438 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
440 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
441 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
443 if ( zExp < 0 ) {
444 if (STATUS(flush_to_zero)) {
445 float_raise(float_flag_output_denormal STATUS_VAR);
446 return packFloat32(zSign, 0, 0);
448 isTiny =
449 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
450 || ( zExp < -1 )
451 || ( zSig + roundIncrement < 0x80000000 );
452 shift32RightJamming( zSig, - zExp, &zSig );
453 zExp = 0;
454 roundBits = zSig & 0x7F;
455 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
458 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
459 zSig = ( zSig + roundIncrement )>>7;
460 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
461 if ( zSig == 0 ) zExp = 0;
462 return packFloat32( zSign, zExp, zSig );
466 /*----------------------------------------------------------------------------
467 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
468 | and significand `zSig', and returns the proper single-precision floating-
469 | point value corresponding to the abstract input. This routine is just like
470 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
471 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
472 | floating-point exponent.
473 *----------------------------------------------------------------------------*/
475 static float32
476 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
477 float_status *status)
479 int8 shiftCount;
481 shiftCount = countLeadingZeros32( zSig ) - 1;
482 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
486 /*----------------------------------------------------------------------------
487 | Returns the fraction bits of the double-precision floating-point value `a'.
488 *----------------------------------------------------------------------------*/
490 static inline uint64_t extractFloat64Frac( float64 a )
493 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
497 /*----------------------------------------------------------------------------
498 | Returns the exponent bits of the double-precision floating-point value `a'.
499 *----------------------------------------------------------------------------*/
501 static inline int_fast16_t extractFloat64Exp(float64 a)
504 return ( float64_val(a)>>52 ) & 0x7FF;
508 /*----------------------------------------------------------------------------
509 | Returns the sign bit of the double-precision floating-point value `a'.
510 *----------------------------------------------------------------------------*/
512 static inline flag extractFloat64Sign( float64 a )
515 return float64_val(a)>>63;
519 /*----------------------------------------------------------------------------
520 | If `a' is denormal and we are in flush-to-zero mode then set the
521 | input-denormal exception and return zero. Otherwise just return the value.
522 *----------------------------------------------------------------------------*/
523 float64 float64_squash_input_denormal(float64 a, float_status *status)
525 if (STATUS(flush_inputs_to_zero)) {
526 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
527 float_raise(float_flag_input_denormal STATUS_VAR);
528 return make_float64(float64_val(a) & (1ULL << 63));
531 return a;
534 /*----------------------------------------------------------------------------
535 | Normalizes the subnormal double-precision floating-point value represented
536 | by the denormalized significand `aSig'. The normalized exponent and
537 | significand are stored at the locations pointed to by `zExpPtr' and
538 | `zSigPtr', respectively.
539 *----------------------------------------------------------------------------*/
541 static void
542 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
544 int8 shiftCount;
546 shiftCount = countLeadingZeros64( aSig ) - 11;
547 *zSigPtr = aSig<<shiftCount;
548 *zExpPtr = 1 - shiftCount;
552 /*----------------------------------------------------------------------------
553 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
554 | double-precision floating-point value, returning the result. After being
555 | shifted into the proper positions, the three fields are simply added
556 | together to form the result. This means that any integer portion of `zSig'
557 | will be added into the exponent. Since a properly normalized significand
558 | will have an integer portion equal to 1, the `zExp' input should be 1 less
559 | than the desired result exponent whenever `zSig' is a complete, normalized
560 | significand.
561 *----------------------------------------------------------------------------*/
563 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
566 return make_float64(
567 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
571 /*----------------------------------------------------------------------------
572 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
573 | and significand `zSig', and returns the proper double-precision floating-
574 | point value corresponding to the abstract input. Ordinarily, the abstract
575 | value is simply rounded and packed into the double-precision format, with
576 | the inexact exception raised if the abstract input cannot be represented
577 | exactly. However, if the abstract value is too large, the overflow and
578 | inexact exceptions are raised and an infinity or maximal finite value is
579 | returned. If the abstract value is too small, the input value is rounded to
580 | a subnormal number, and the underflow and inexact exceptions are raised if
581 | the abstract input cannot be represented exactly as a subnormal double-
582 | precision floating-point number.
583 | The input significand `zSig' has its binary point between bits 62
584 | and 61, which is 10 bits to the left of the usual location. This shifted
585 | significand must be normalized or smaller. If `zSig' is not normalized,
586 | `zExp' must be 0; in that case, the result returned is a subnormal number,
587 | and it must not require rounding. In the usual case that `zSig' is
588 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
589 | The handling of underflow and overflow follows the IEC/IEEE Standard for
590 | Binary Floating-Point Arithmetic.
591 *----------------------------------------------------------------------------*/
593 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
594 float_status *status)
596 int8 roundingMode;
597 flag roundNearestEven;
598 int_fast16_t roundIncrement, roundBits;
599 flag isTiny;
601 roundingMode = STATUS(float_rounding_mode);
602 roundNearestEven = ( roundingMode == float_round_nearest_even );
603 switch (roundingMode) {
604 case float_round_nearest_even:
605 case float_round_ties_away:
606 roundIncrement = 0x200;
607 break;
608 case float_round_to_zero:
609 roundIncrement = 0;
610 break;
611 case float_round_up:
612 roundIncrement = zSign ? 0 : 0x3ff;
613 break;
614 case float_round_down:
615 roundIncrement = zSign ? 0x3ff : 0;
616 break;
617 default:
618 abort();
620 roundBits = zSig & 0x3FF;
621 if ( 0x7FD <= (uint16_t) zExp ) {
622 if ( ( 0x7FD < zExp )
623 || ( ( zExp == 0x7FD )
624 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
626 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
627 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
629 if ( zExp < 0 ) {
630 if (STATUS(flush_to_zero)) {
631 float_raise(float_flag_output_denormal STATUS_VAR);
632 return packFloat64(zSign, 0, 0);
634 isTiny =
635 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
636 || ( zExp < -1 )
637 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
638 shift64RightJamming( zSig, - zExp, &zSig );
639 zExp = 0;
640 roundBits = zSig & 0x3FF;
641 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645 zSig = ( zSig + roundIncrement )>>10;
646 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
647 if ( zSig == 0 ) zExp = 0;
648 return packFloat64( zSign, zExp, zSig );
652 /*----------------------------------------------------------------------------
653 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
654 | and significand `zSig', and returns the proper double-precision floating-
655 | point value corresponding to the abstract input. This routine is just like
656 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
657 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
658 | floating-point exponent.
659 *----------------------------------------------------------------------------*/
661 static float64
662 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
663 float_status *status)
665 int8 shiftCount;
667 shiftCount = countLeadingZeros64( zSig ) - 1;
668 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
672 /*----------------------------------------------------------------------------
673 | Returns the fraction bits of the extended double-precision floating-point
674 | value `a'.
675 *----------------------------------------------------------------------------*/
677 static inline uint64_t extractFloatx80Frac( floatx80 a )
680 return a.low;
684 /*----------------------------------------------------------------------------
685 | Returns the exponent bits of the extended double-precision floating-point
686 | value `a'.
687 *----------------------------------------------------------------------------*/
689 static inline int32 extractFloatx80Exp( floatx80 a )
692 return a.high & 0x7FFF;
696 /*----------------------------------------------------------------------------
697 | Returns the sign bit of the extended double-precision floating-point value
698 | `a'.
699 *----------------------------------------------------------------------------*/
701 static inline flag extractFloatx80Sign( floatx80 a )
704 return a.high>>15;
708 /*----------------------------------------------------------------------------
709 | Normalizes the subnormal extended double-precision floating-point value
710 | represented by the denormalized significand `aSig'. The normalized exponent
711 | and significand are stored at the locations pointed to by `zExpPtr' and
712 | `zSigPtr', respectively.
713 *----------------------------------------------------------------------------*/
715 static void
716 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
718 int8 shiftCount;
720 shiftCount = countLeadingZeros64( aSig );
721 *zSigPtr = aSig<<shiftCount;
722 *zExpPtr = 1 - shiftCount;
726 /*----------------------------------------------------------------------------
727 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
728 | extended double-precision floating-point value, returning the result.
729 *----------------------------------------------------------------------------*/
731 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
733 floatx80 z;
735 z.low = zSig;
736 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
737 return z;
741 /*----------------------------------------------------------------------------
742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
743 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
744 | and returns the proper extended double-precision floating-point value
745 | corresponding to the abstract input. Ordinarily, the abstract value is
746 | rounded and packed into the extended double-precision format, with the
747 | inexact exception raised if the abstract input cannot be represented
748 | exactly. However, if the abstract value is too large, the overflow and
749 | inexact exceptions are raised and an infinity or maximal finite value is
750 | returned. If the abstract value is too small, the input value is rounded to
751 | a subnormal number, and the underflow and inexact exceptions are raised if
752 | the abstract input cannot be represented exactly as a subnormal extended
753 | double-precision floating-point number.
754 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
755 | number of bits as single or double precision, respectively. Otherwise, the
756 | result is rounded to the full precision of the extended double-precision
757 | format.
758 | The input significand must be normalized or smaller. If the input
759 | significand is not normalized, `zExp' must be 0; in that case, the result
760 | returned is a subnormal number, and it must not require rounding. The
761 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
762 | Floating-Point Arithmetic.
763 *----------------------------------------------------------------------------*/
/* Rounds the abstract value (zSign, zExp, zSig0:zSig1) using the rounding
 * mode read from `status' and the requested `roundingPrecision', and returns
 * the value packed by packFloatx80().  Exception flags are accumulated in
 * `status' (inexact, underflow, overflow, output-denormal) as shown on the
 * individual paths below.
 * NOTE: this listing carries embedded line numbers and omits brace-only
 * lines, so block nesting has to be read from the statements themselves. */
765 static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
766 int32 zExp, uint64_t zSig0, uint64_t zSig1,
767 float_status *status)
769 int8 roundingMode;
770 flag roundNearestEven, increment, isTiny;
771 int64 roundIncrement, roundMask, roundBits;
/* roundNearestEven is set only for float_round_nearest_even; ties_away
 * shares the same increment but never takes the tie-to-even correction. */
773 roundingMode = STATUS(float_rounding_mode);
774 roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Precision 80, and any value other than 64 or 32, is handled at the
 * precision80 label further down. */
775 if ( roundingPrecision == 80 ) goto precision80;
/* Reduced precision: roundMask covers the zSig0 bits that are discarded
 * (low 11 bits for 64, low 40 bits for 32) and roundIncrement starts out
 * as half of that range. */
776 if ( roundingPrecision == 64 ) {
777 roundIncrement = LIT64( 0x0000000000000400 );
778 roundMask = LIT64( 0x00000000000007FF );
780 else if ( roundingPrecision == 32 ) {
781 roundIncrement = LIT64( 0x0000008000000000 );
782 roundMask = LIT64( 0x000000FFFFFFFFFF );
784 else {
785 goto precision80;
/* Any nonzero zSig1 is folded into bit 0 of zSig0 so it still takes part
 * in the rounding decision. */
787 zSig0 |= ( zSig1 != 0 );
/* Directed modes override the increment: 0 truncates, roundMask bumps the
 * kept part whenever any discarded bit is set. */
788 switch (roundingMode) {
789 case float_round_nearest_even:
790 case float_round_ties_away:
791 break;
792 case float_round_to_zero:
793 roundIncrement = 0;
794 break;
795 case float_round_up:
796 roundIncrement = zSign ? 0 : roundMask;
797 break;
798 case float_round_down:
799 roundIncrement = zSign ? roundMask : 0;
800 break;
801 default:
802 abort();
804 roundBits = zSig0 & roundMask;
/* The unsigned compare is true both for zExp >= 0x7FFE and for zExp <= 0
 * (zExp - 1 is then negative and converts to a large unsigned value). */
805 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
/* Overflow: exponent above 0x7FFE, or exactly 0x7FFE with the increment
 * carrying out of zSig0. */
806 if ( ( 0x7FFE < zExp )
807 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
809 goto overflow;
/* Exponent <= 0: either flush to a signed zero or build a subnormal. */
811 if ( zExp <= 0 ) {
812 if (STATUS(flush_to_zero)) {
813 float_raise(float_flag_output_denormal STATUS_VAR);
814 return packFloatx80(zSign, 0, 0);
/* isTiny: tininess detected before rounding, or zExp < 0, or adding the
 * increment does not wrap zSig0. */
816 isTiny =
817 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
818 || ( zExp < 0 )
819 || ( zSig0 <= zSig0 + roundIncrement );
/* Denormalise by 1 - zExp bits; roundBits is recomputed from the shifted
 * significand.  (shift64RightJamming presumably keeps lost bits sticky --
 * confirm against its definition.) */
820 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
821 zExp = 0;
822 roundBits = zSig0 & roundMask;
823 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
824 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
825 zSig0 += roundIncrement;
/* Bit 63 set after rounding: the exponent field becomes 1. */
826 if ( (int64_t) zSig0 < 0 ) zExp = 1;
/* Exact tie in nearest-even mode: widen the mask by one bit so the lowest
 * kept bit is cleared as well. */
827 roundIncrement = roundMask + 1;
828 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
829 roundMask |= roundIncrement;
831 zSig0 &= ~ roundMask;
832 return packFloatx80( zSign, zExp, zSig0 );
/* Reduced precision, exponent in the normal range. */
835 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
836 zSig0 += roundIncrement;
/* The addition wrapped: bump the exponent and set the significand to
 * 0x8000000000000000. */
837 if ( zSig0 < roundIncrement ) {
838 ++zExp;
839 zSig0 = LIT64( 0x8000000000000000 );
841 roundIncrement = roundMask + 1;
842 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
843 roundMask |= roundIncrement;
845 zSig0 &= ~ roundMask;
/* A zero significand is packed with a zero exponent. */
846 if ( zSig0 == 0 ) zExp = 0;
847 return packFloatx80( zSign, zExp, zSig0 );
/* Full 64-bit significand: zSig1 holds the bits below the result.  Nearest
 * modes increment when the top bit of zSig1 is set; directed modes when
 * zSig1 is nonzero and the sign matches the direction. */
848 precision80:
849 switch (roundingMode) {
850 case float_round_nearest_even:
851 case float_round_ties_away:
852 increment = ((int64_t)zSig1 < 0);
853 break;
854 case float_round_to_zero:
855 increment = 0;
856 break;
857 case float_round_up:
858 increment = !zSign && zSig1;
859 break;
860 case float_round_down:
861 increment = zSign && zSig1;
862 break;
863 default:
864 abort();
/* Same out-of-range test as above; overflow here needs an all-ones zSig0
 * that is about to be incremented at exponent 0x7FFE. */
866 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
867 if ( ( 0x7FFE < zExp )
868 || ( ( zExp == 0x7FFE )
869 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
870 && increment
/* roundMask = 0 makes ~roundMask below an all-ones significand.  The
 * overflow label is also the target of the reduced-precision goto, which
 * arrives with its own roundMask. */
873 roundMask = 0;
874 overflow:
875 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round toward zero for this sign return exponent 0x7FFE with
 * significand ~roundMask; all others return exponent 0x7FFF with
 * significand 0x8000000000000000. */
876 if ( ( roundingMode == float_round_to_zero )
877 || ( zSign && ( roundingMode == float_round_up ) )
878 || ( ! zSign && ( roundingMode == float_round_down ) )
880 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
882 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
/* Full-precision subnormal path.  Unlike the reduced-precision path, no
 * flush_to_zero test appears here. */
884 if ( zExp <= 0 ) {
885 isTiny =
886 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
887 || ( zExp < 0 )
888 || ! increment
889 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
890 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
891 zExp = 0;
892 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
893 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
/* The increment is recomputed from the shifted zSig1. */
894 switch (roundingMode) {
895 case float_round_nearest_even:
896 case float_round_ties_away:
897 increment = ((int64_t)zSig1 < 0);
898 break;
899 case float_round_to_zero:
900 increment = 0;
901 break;
902 case float_round_up:
903 increment = !zSign && zSig1;
904 break;
905 case float_round_down:
906 increment = zSign && zSig1;
907 break;
908 default:
909 abort();
/* After incrementing, an exact tie ((zSig1 << 1) == 0) in nearest-even
 * mode clears bit 0 of zSig0. */
911 if ( increment ) {
912 ++zSig0;
913 zSig0 &=
914 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
915 if ( (int64_t) zSig0 < 0 ) zExp = 1;
917 return packFloatx80( zSign, zExp, zSig0 );
/* Full precision, exponent in the normal range. */
920 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
921 if ( increment ) {
922 ++zSig0;
/* zSig0 wrapped to 0: bump the exponent and set the significand to
 * 0x8000000000000000. */
923 if ( zSig0 == 0 ) {
924 ++zExp;
925 zSig0 = LIT64( 0x8000000000000000 );
927 else {
928 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
931 else {
932 if ( zSig0 == 0 ) zExp = 0;
934 return packFloatx80( zSign, zExp, zSig0 );
938 /*----------------------------------------------------------------------------
939 | Takes an abstract floating-point value having sign `zSign', exponent
940 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
941 | and returns the proper extended double-precision floating-point value
942 | corresponding to the abstract input. This routine is just like
943 | `roundAndPackFloatx80' except that the input significand does not have to be
944 | normalized.
945 *----------------------------------------------------------------------------*/
947 static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
948 flag zSign, int32 zExp,
949 uint64_t zSig0, uint64_t zSig1,
950 float_status *status)
952 int8 shiftCount;
954 if ( zSig0 == 0 ) {
955 zSig0 = zSig1;
956 zSig1 = 0;
957 zExp -= 64;
959 shiftCount = countLeadingZeros64( zSig0 );
960 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
961 zExp -= shiftCount;
962 return
963 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
967 /*----------------------------------------------------------------------------
968 | Returns the least-significant 64 fraction bits of the quadruple-precision
969 | floating-point value `a'.
970 *----------------------------------------------------------------------------*/
972 static inline uint64_t extractFloat128Frac1( float128 a )
975 return a.low;
979 /*----------------------------------------------------------------------------
980 | Returns the most-significant 48 fraction bits of the quadruple-precision
981 | floating-point value `a'.
982 *----------------------------------------------------------------------------*/
984 static inline uint64_t extractFloat128Frac0( float128 a )
987 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
991 /*----------------------------------------------------------------------------
992 | Returns the exponent bits of the quadruple-precision floating-point value
993 | `a'.
994 *----------------------------------------------------------------------------*/
996 static inline int32 extractFloat128Exp( float128 a )
999 return ( a.high>>48 ) & 0x7FFF;
1003 /*----------------------------------------------------------------------------
1004 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1005 *----------------------------------------------------------------------------*/
1007 static inline flag extractFloat128Sign( float128 a )
1010 return a.high>>63;
1014 /*----------------------------------------------------------------------------
1015 | Normalizes the subnormal quadruple-precision floating-point value
1016 | represented by the denormalized significand formed by the concatenation of
1017 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1018 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1019 | significand are stored at the location pointed to by `zSig0Ptr', and the
1020 | least significant 64 bits of the normalized significand are stored at the
1021 | location pointed to by `zSig1Ptr'.
1022 *----------------------------------------------------------------------------*/
1024 static void
1025 normalizeFloat128Subnormal(
1026 uint64_t aSig0,
1027 uint64_t aSig1,
1028 int32 *zExpPtr,
1029 uint64_t *zSig0Ptr,
1030 uint64_t *zSig1Ptr
1033 int8 shiftCount;
1035 if ( aSig0 == 0 ) {
1036 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1037 if ( shiftCount < 0 ) {
1038 *zSig0Ptr = aSig1>>( - shiftCount );
1039 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1041 else {
1042 *zSig0Ptr = aSig1<<shiftCount;
1043 *zSig1Ptr = 0;
1045 *zExpPtr = - shiftCount - 63;
1047 else {
1048 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1049 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1050 *zExpPtr = 1 - shiftCount;
1055 /*----------------------------------------------------------------------------
1056 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1057 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1058 | floating-point value, returning the result. After being shifted into the
1059 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1060 | added together to form the most significant 32 bits of the result. This
1061 | means that any integer portion of `zSig0' will be added into the exponent.
1062 | Since a properly normalized significand will have an integer portion equal
1063 | to 1, the `zExp' input should be 1 less than the desired result exponent
1064 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1065 | significand.
1066 *----------------------------------------------------------------------------*/
1068 static inline float128
1069 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1071 float128 z;
1073 z.low = zSig1;
1074 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1075 return z;
1079 /*----------------------------------------------------------------------------
1080 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1081 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1082 | and `zSig2', and returns the proper quadruple-precision floating-point value
1083 | corresponding to the abstract input. Ordinarily, the abstract value is
1084 | simply rounded and packed into the quadruple-precision format, with the
1085 | inexact exception raised if the abstract input cannot be represented
1086 | exactly. However, if the abstract value is too large, the overflow and
1087 | inexact exceptions are raised and an infinity or maximal finite value is
1088 | returned. If the abstract value is too small, the input value is rounded to
1089 | a subnormal number, and the underflow and inexact exceptions are raised if
1090 | the abstract input cannot be represented exactly as a subnormal quadruple-
1091 | precision floating-point number.
1092 | The input significand must be normalized or smaller. If the input
1093 | significand is not normalized, `zExp' must be 0; in that case, the result
1094 | returned is a subnormal number, and it must not require rounding. In the
1095 | usual case that the input significand is normalized, `zExp' must be 1 less
1096 | than the ``true'' floating-point exponent. The handling of underflow and
1097 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1098 *----------------------------------------------------------------------------*/
/* Rounds the abstract value (zSign, zExp, zSig0:zSig1:zSig2) using the
 * rounding mode read from `status' and returns the value packed by
 * packFloat128().  zSig2 holds the bits below the result significand and
 * drives the rounding decision.  Exception flags are accumulated in `status'.
 * NOTE: this listing carries embedded line numbers and omits brace-only
 * lines, so block nesting has to be read from the statements themselves. */
1100 static float128 roundAndPackFloat128(flag zSign, int32 zExp,
1101 uint64_t zSig0, uint64_t zSig1,
1102 uint64_t zSig2, float_status *status)
1104 int8 roundingMode;
1105 flag roundNearestEven, increment, isTiny;
/* roundNearestEven is set only for float_round_nearest_even; ties_away
 * uses the same increment but skips the tie-to-even correction. */
1107 roundingMode = STATUS(float_rounding_mode);
1108 roundNearestEven = ( roundingMode == float_round_nearest_even );
/* Nearest modes increment when the top bit of zSig2 is set; directed modes
 * when zSig2 is nonzero and the sign matches the direction. */
1109 switch (roundingMode) {
1110 case float_round_nearest_even:
1111 case float_round_ties_away:
1112 increment = ((int64_t)zSig2 < 0);
1113 break;
1114 case float_round_to_zero:
1115 increment = 0;
1116 break;
1117 case float_round_up:
1118 increment = !zSign && zSig2;
1119 break;
1120 case float_round_down:
1121 increment = zSign && zSig2;
1122 break;
1123 default:
1124 abort();
/* The unsigned compare is true for zExp >= 0x7FFD and for negative zExp
 * (which converts to a large unsigned value). */
1126 if ( 0x7FFD <= (uint32_t) zExp ) {
/* Overflow: exponent above 0x7FFD, or exactly 0x7FFD with zSig0:zSig1 equal
 * to 0x0001FFFFFFFFFFFF:0xFFFFFFFFFFFFFFFF and an increment pending. */
1127 if ( ( 0x7FFD < zExp )
1128 || ( ( zExp == 0x7FFD )
1129 && eq128(
1130 LIT64( 0x0001FFFFFFFFFFFF ),
1131 LIT64( 0xFFFFFFFFFFFFFFFF ),
1132 zSig0,
1133 zSig1
1135 && increment
1138 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round toward zero for this sign return exponent 0x7FFE with an
 * all-ones fraction; all others return exponent 0x7FFF with zero fraction. */
1139 if ( ( roundingMode == float_round_to_zero )
1140 || ( zSign && ( roundingMode == float_round_up ) )
1141 || ( ! zSign && ( roundingMode == float_round_down ) )
1143 return
1144 packFloat128(
1145 zSign,
1146 0x7FFE,
1147 LIT64( 0x0000FFFFFFFFFFFF ),
1148 LIT64( 0xFFFFFFFFFFFFFFFF )
1151 return packFloat128( zSign, 0x7FFF, 0, 0 );
/* Negative exponent: either flush to a signed zero or build a subnormal. */
1153 if ( zExp < 0 ) {
1154 if (STATUS(flush_to_zero)) {
1155 float_raise(float_flag_output_denormal STATUS_VAR);
1156 return packFloat128(zSign, 0, 0, 0);
/* isTiny: tininess detected before rounding, or zExp < -1, or no increment
 * pending, or the significand is below the all-ones pattern. */
1158 isTiny =
1159 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1160 || ( zExp < -1 )
1161 || ! increment
1162 || lt128(
1163 zSig0,
1164 zSig1,
1165 LIT64( 0x0001FFFFFFFFFFFF ),
1166 LIT64( 0xFFFFFFFFFFFFFFFF )
/* Denormalise by -zExp bits across all three words, then recompute the
 * increment from the shifted zSig2. */
1168 shift128ExtraRightJamming(
1169 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1170 zExp = 0;
1171 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1172 switch (roundingMode) {
1173 case float_round_nearest_even:
1174 case float_round_ties_away:
1175 increment = ((int64_t)zSig2 < 0);
1176 break;
1177 case float_round_to_zero:
1178 increment = 0;
1179 break;
1180 case float_round_up:
1181 increment = !zSign && zSig2;
1182 break;
1183 case float_round_down:
1184 increment = zSign && zSig2;
1185 break;
1186 default:
1187 abort();
/* Any nonzero zSig2 means the packed result is inexact. */
1191 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
/* Add one unit to zSig0:zSig1; on an exact tie (zSig2 + zSig2 == 0) in
 * nearest-even mode, clear bit 0 of zSig1. */
1192 if ( increment ) {
1193 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1194 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1196 else {
/* A zero significand is packed with a zero exponent. */
1197 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1199 return packFloat128( zSign, zExp, zSig0, zSig1 );
1203 /*----------------------------------------------------------------------------
1204 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1205 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1206 | returns the proper quadruple-precision floating-point value corresponding
1207 | to the abstract input. This routine is just like `roundAndPackFloat128'
1208 | except that the input significand has fewer bits and does not have to be
1209 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1210 | point exponent.
1211 *----------------------------------------------------------------------------*/
1213 static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1214 uint64_t zSig0, uint64_t zSig1,
1215 float_status *status)
1217 int8 shiftCount;
1218 uint64_t zSig2;
1220 if ( zSig0 == 0 ) {
1221 zSig0 = zSig1;
1222 zSig1 = 0;
1223 zExp -= 64;
1225 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1226 if ( 0 <= shiftCount ) {
1227 zSig2 = 0;
1228 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1230 else {
1231 shift128ExtraRightJamming(
1232 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1234 zExp -= shiftCount;
1235 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1239 /*----------------------------------------------------------------------------
1240 | Returns the result of converting the 32-bit two's complement integer `a'
1241 | to the single-precision floating-point format. The conversion is performed
1242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 *----------------------------------------------------------------------------*/
1245 float32 int32_to_float32(int32_t a, float_status *status)
1247 flag zSign;
1249 if ( a == 0 ) return float32_zero;
1250 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1251 zSign = ( a < 0 );
1252 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 32-bit two's complement integer `a'
1258 | to the double-precision floating-point format. The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1262 float64 int32_to_float64(int32_t a, float_status *status)
1264 flag zSign;
1265 uint32 absA;
1266 int8 shiftCount;
1267 uint64_t zSig;
1269 if ( a == 0 ) return float64_zero;
1270 zSign = ( a < 0 );
1271 absA = zSign ? - a : a;
1272 shiftCount = countLeadingZeros32( absA ) + 21;
1273 zSig = absA;
1274 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1278 /*----------------------------------------------------------------------------
1279 | Returns the result of converting the 32-bit two's complement integer `a'
1280 | to the extended double-precision floating-point format. The conversion
1281 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 | Arithmetic.
1283 *----------------------------------------------------------------------------*/
1285 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1287 flag zSign;
1288 uint32 absA;
1289 int8 shiftCount;
1290 uint64_t zSig;
1292 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293 zSign = ( a < 0 );
1294 absA = zSign ? - a : a;
1295 shiftCount = countLeadingZeros32( absA ) + 32;
1296 zSig = absA;
1297 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1301 /*----------------------------------------------------------------------------
1302 | Returns the result of converting the 32-bit two's complement integer `a' to
1303 | the quadruple-precision floating-point format. The conversion is performed
1304 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 *----------------------------------------------------------------------------*/
1307 float128 int32_to_float128(int32_t a, float_status *status)
1309 flag zSign;
1310 uint32 absA;
1311 int8 shiftCount;
1312 uint64_t zSig0;
1314 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1315 zSign = ( a < 0 );
1316 absA = zSign ? - a : a;
1317 shiftCount = countLeadingZeros32( absA ) + 17;
1318 zSig0 = absA;
1319 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the 64-bit two's complement integer `a'
1325 | to the single-precision floating-point format. The conversion is performed
1326 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1327 *----------------------------------------------------------------------------*/
1329 float32 int64_to_float32(int64_t a, float_status *status)
1331 flag zSign;
1332 uint64 absA;
1333 int8 shiftCount;
1335 if ( a == 0 ) return float32_zero;
1336 zSign = ( a < 0 );
1337 absA = zSign ? - a : a;
1338 shiftCount = countLeadingZeros64( absA ) - 40;
1339 if ( 0 <= shiftCount ) {
1340 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1342 else {
1343 shiftCount += 7;
1344 if ( shiftCount < 0 ) {
1345 shift64RightJamming( absA, - shiftCount, &absA );
1347 else {
1348 absA <<= shiftCount;
1350 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1355 /*----------------------------------------------------------------------------
1356 | Returns the result of converting the 64-bit two's complement integer `a'
1357 | to the double-precision floating-point format. The conversion is performed
1358 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1359 *----------------------------------------------------------------------------*/
1361 float64 int64_to_float64(int64_t a, float_status *status)
1363 flag zSign;
1365 if ( a == 0 ) return float64_zero;
1366 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1367 return packFloat64( 1, 0x43E, 0 );
1369 zSign = ( a < 0 );
1370 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1374 /*----------------------------------------------------------------------------
1375 | Returns the result of converting the 64-bit two's complement integer `a'
1376 | to the extended double-precision floating-point format. The conversion
1377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1378 | Arithmetic.
1379 *----------------------------------------------------------------------------*/
1381 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1383 flag zSign;
1384 uint64 absA;
1385 int8 shiftCount;
1387 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1388 zSign = ( a < 0 );
1389 absA = zSign ? - a : a;
1390 shiftCount = countLeadingZeros64( absA );
1391 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1395 /*----------------------------------------------------------------------------
1396 | Returns the result of converting the 64-bit two's complement integer `a' to
1397 | the quadruple-precision floating-point format. The conversion is performed
1398 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1399 *----------------------------------------------------------------------------*/
1401 float128 int64_to_float128(int64_t a, float_status *status)
1403 flag zSign;
1404 uint64 absA;
1405 int8 shiftCount;
1406 int32 zExp;
1407 uint64_t zSig0, zSig1;
1409 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1410 zSign = ( a < 0 );
1411 absA = zSign ? - a : a;
1412 shiftCount = countLeadingZeros64( absA ) + 49;
1413 zExp = 0x406E - shiftCount;
1414 if ( 64 <= shiftCount ) {
1415 zSig1 = 0;
1416 zSig0 = absA;
1417 shiftCount -= 64;
1419 else {
1420 zSig1 = absA;
1421 zSig0 = 0;
1423 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1424 return packFloat128( zSign, zExp, zSig0, zSig1 );
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the 64-bit unsigned integer `a'
1430 | to the single-precision floating-point format. The conversion is performed
1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432 *----------------------------------------------------------------------------*/
1434 float32 uint64_to_float32(uint64_t a, float_status *status)
1436 int shiftcount;
1438 if (a == 0) {
1439 return float32_zero;
1442 /* Determine (left) shift needed to put first set bit into bit posn 23
1443 * (since packFloat32() expects the binary point between bits 23 and 22);
1444 * this is the fast case for smallish numbers.
1446 shiftcount = countLeadingZeros64(a) - 40;
1447 if (shiftcount >= 0) {
1448 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1450 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1451 * expects the binary point between bits 30 and 29, hence the + 7.
1453 shiftcount += 7;
1454 if (shiftcount < 0) {
1455 shift64RightJamming(a, -shiftcount, &a);
1456 } else {
1457 a <<= shiftcount;
1460 return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1463 /*----------------------------------------------------------------------------
1464 | Returns the result of converting the 64-bit unsigned integer `a'
1465 | to the double-precision floating-point format. The conversion is performed
1466 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1467 *----------------------------------------------------------------------------*/
1469 float64 uint64_to_float64(uint64_t a, float_status *status)
1471 int exp = 0x43C;
1472 int shiftcount;
1474 if (a == 0) {
1475 return float64_zero;
1478 shiftcount = countLeadingZeros64(a) - 1;
1479 if (shiftcount < 0) {
1480 shift64RightJamming(a, -shiftcount, &a);
1481 } else {
1482 a <<= shiftcount;
1484 return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1487 /*----------------------------------------------------------------------------
1488 | Returns the result of converting the 64-bit unsigned integer `a'
1489 | to the quadruple-precision floating-point format. The conversion is performed
1490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1491 *----------------------------------------------------------------------------*/
1493 float128 uint64_to_float128(uint64_t a, float_status *status)
1495 if (a == 0) {
1496 return float128_zero;
1498 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1501 /*----------------------------------------------------------------------------
1502 | Returns the result of converting the single-precision floating-point value
1503 | `a' to the 32-bit two's complement integer format. The conversion is
1504 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1505 | Arithmetic---which means in particular that the conversion is rounded
1506 | according to the current rounding mode. If `a' is a NaN, the largest
1507 | positive integer is returned. Otherwise, if the conversion overflows, the
1508 | largest integer with the same sign as `a' is returned.
1509 *----------------------------------------------------------------------------*/
1511 int32 float32_to_int32(float32 a, float_status *status)
1513 flag aSign;
1514 int_fast16_t aExp, shiftCount;
1515 uint32_t aSig;
1516 uint64_t aSig64;
1518 a = float32_squash_input_denormal(a STATUS_VAR);
1519 aSig = extractFloat32Frac( a );
1520 aExp = extractFloat32Exp( a );
1521 aSign = extractFloat32Sign( a );
1522 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1523 if ( aExp ) aSig |= 0x00800000;
1524 shiftCount = 0xAF - aExp;
1525 aSig64 = aSig;
1526 aSig64 <<= 32;
1527 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1528 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1532 /*----------------------------------------------------------------------------
1533 | Returns the result of converting the single-precision floating-point value
1534 | `a' to the 32-bit two's complement integer format. The conversion is
1535 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 | Arithmetic, except that the conversion is always rounded toward zero.
1537 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1538 | the conversion overflows, the largest integer with the same sign as `a' is
1539 | returned.
1540 *----------------------------------------------------------------------------*/
1542 int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
1544 flag aSign;
1545 int_fast16_t aExp, shiftCount;
1546 uint32_t aSig;
1547 int32_t z;
1548 a = float32_squash_input_denormal(a STATUS_VAR);
1550 aSig = extractFloat32Frac( a );
1551 aExp = extractFloat32Exp( a );
1552 aSign = extractFloat32Sign( a );
1553 shiftCount = aExp - 0x9E;
1554 if ( 0 <= shiftCount ) {
1555 if ( float32_val(a) != 0xCF000000 ) {
1556 float_raise( float_flag_invalid STATUS_VAR);
1557 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1559 return (int32_t) 0x80000000;
1561 else if ( aExp <= 0x7E ) {
1562 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1563 return 0;
1565 aSig = ( aSig | 0x00800000 )<<8;
1566 z = aSig>>( - shiftCount );
1567 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1568 STATUS(float_exception_flags) |= float_flag_inexact;
1570 if ( aSign ) z = - z;
1571 return z;
1575 /*----------------------------------------------------------------------------
1576 | Returns the result of converting the single-precision floating-point value
1577 | `a' to the 16-bit two's complement integer format. The conversion is
1578 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1579 | Arithmetic, except that the conversion is always rounded toward zero.
1580 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1581 | the conversion overflows, the largest integer with the same sign as `a' is
1582 | returned.
1583 *----------------------------------------------------------------------------*/
1585 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1587 flag aSign;
1588 int_fast16_t aExp, shiftCount;
1589 uint32_t aSig;
1590 int32 z;
1592 aSig = extractFloat32Frac( a );
1593 aExp = extractFloat32Exp( a );
1594 aSign = extractFloat32Sign( a );
1595 shiftCount = aExp - 0x8E;
1596 if ( 0 <= shiftCount ) {
1597 if ( float32_val(a) != 0xC7000000 ) {
1598 float_raise( float_flag_invalid STATUS_VAR);
1599 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1600 return 0x7FFF;
1603 return (int32_t) 0xffff8000;
1605 else if ( aExp <= 0x7E ) {
1606 if ( aExp | aSig ) {
1607 STATUS(float_exception_flags) |= float_flag_inexact;
1609 return 0;
1611 shiftCount -= 0x10;
1612 aSig = ( aSig | 0x00800000 )<<8;
1613 z = aSig>>( - shiftCount );
1614 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1615 STATUS(float_exception_flags) |= float_flag_inexact;
1617 if ( aSign ) {
1618 z = - z;
1620 return z;
1624 /*----------------------------------------------------------------------------
1625 | Returns the result of converting the single-precision floating-point value
1626 | `a' to the 64-bit two's complement integer format. The conversion is
1627 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1628 | Arithmetic---which means in particular that the conversion is rounded
1629 | according to the current rounding mode. If `a' is a NaN, the largest
1630 | positive integer is returned. Otherwise, if the conversion overflows, the
1631 | largest integer with the same sign as `a' is returned.
1632 *----------------------------------------------------------------------------*/
1634 int64 float32_to_int64(float32 a, float_status *status)
1636 flag aSign;
1637 int_fast16_t aExp, shiftCount;
1638 uint32_t aSig;
1639 uint64_t aSig64, aSigExtra;
1640 a = float32_squash_input_denormal(a STATUS_VAR);
1642 aSig = extractFloat32Frac( a );
1643 aExp = extractFloat32Exp( a );
1644 aSign = extractFloat32Sign( a );
1645 shiftCount = 0xBE - aExp;
1646 if ( shiftCount < 0 ) {
1647 float_raise( float_flag_invalid STATUS_VAR);
1648 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1649 return LIT64( 0x7FFFFFFFFFFFFFFF );
1651 return (int64_t) LIT64( 0x8000000000000000 );
1653 if ( aExp ) aSig |= 0x00800000;
1654 aSig64 = aSig;
1655 aSig64 <<= 40;
1656 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1657 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1661 /*----------------------------------------------------------------------------
1662 | Returns the result of converting the single-precision floating-point value
1663 | `a' to the 64-bit unsigned integer format. The conversion is
1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1665 | Arithmetic---which means in particular that the conversion is rounded
1666 | according to the current rounding mode. If `a' is a NaN, the largest
1667 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1668 | largest unsigned integer is returned. If `a' is negative, the result
1669 | is rounded and zero is returned; values that do not round to zero will
1670 | raise the inexact exception flag.
1671 *----------------------------------------------------------------------------*/
1673 uint64 float32_to_uint64(float32 a, float_status *status)
1675 flag aSign;
1676 int_fast16_t aExp, shiftCount;
1677 uint32_t aSig;
1678 uint64_t aSig64, aSigExtra;
1679 a = float32_squash_input_denormal(a STATUS_VAR);
1681 aSig = extractFloat32Frac(a);
1682 aExp = extractFloat32Exp(a);
1683 aSign = extractFloat32Sign(a);
1684 if ((aSign) && (aExp > 126)) {
1685 float_raise(float_flag_invalid STATUS_VAR);
1686 if (float32_is_any_nan(a)) {
1687 return LIT64(0xFFFFFFFFFFFFFFFF);
1688 } else {
1689 return 0;
1692 shiftCount = 0xBE - aExp;
1693 if (aExp) {
1694 aSig |= 0x00800000;
1696 if (shiftCount < 0) {
1697 float_raise(float_flag_invalid STATUS_VAR);
1698 return LIT64(0xFFFFFFFFFFFFFFFF);
1701 aSig64 = aSig;
1702 aSig64 <<= 40;
1703 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1704 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1707 /*----------------------------------------------------------------------------
1708 | Returns the result of converting the single-precision floating-point value
1709 | `a' to the 64-bit unsigned integer format. The conversion is
1710 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1711 | Arithmetic, except that the conversion is always rounded toward zero. If
1712 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1713 | conversion overflows, the largest unsigned integer is returned. If
1714 | `a' is negative, the result is rounded and zero is returned; values that do
1715 | not round to zero will raise the inexact flag.
1716 *----------------------------------------------------------------------------*/
1718 uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
1720 signed char current_rounding_mode = STATUS(float_rounding_mode);
1721 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1722 int64_t v = float32_to_uint64(a STATUS_VAR);
1723 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1724 return v;
1727 /*----------------------------------------------------------------------------
1728 | Returns the result of converting the single-precision floating-point value
1729 | `a' to the 64-bit two's complement integer format. The conversion is
1730 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1731 | Arithmetic, except that the conversion is always rounded toward zero. If
1732 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1733 | conversion overflows, the largest integer with the same sign as `a' is
1734 | returned.
1735 *----------------------------------------------------------------------------*/
1737 int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
1739 flag aSign;
1740 int_fast16_t aExp, shiftCount;
1741 uint32_t aSig;
1742 uint64_t aSig64;
1743 int64 z;
1744 a = float32_squash_input_denormal(a STATUS_VAR);
1746 aSig = extractFloat32Frac( a );
1747 aExp = extractFloat32Exp( a );
1748 aSign = extractFloat32Sign( a );
1749 shiftCount = aExp - 0xBE;
1750 if ( 0 <= shiftCount ) {
1751 if ( float32_val(a) != 0xDF000000 ) {
1752 float_raise( float_flag_invalid STATUS_VAR);
1753 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1754 return LIT64( 0x7FFFFFFFFFFFFFFF );
1757 return (int64_t) LIT64( 0x8000000000000000 );
1759 else if ( aExp <= 0x7E ) {
1760 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1761 return 0;
1763 aSig64 = aSig | 0x00800000;
1764 aSig64 <<= 40;
1765 z = aSig64>>( - shiftCount );
1766 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1767 STATUS(float_exception_flags) |= float_flag_inexact;
1769 if ( aSign ) z = - z;
1770 return z;
1774 /*----------------------------------------------------------------------------
1775 | Returns the result of converting the single-precision floating-point value
1776 | `a' to the double-precision floating-point format. The conversion is
1777 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1778 | Arithmetic.
1779 *----------------------------------------------------------------------------*/
1781 float64 float32_to_float64(float32 a, float_status *status)
1783 flag aSign;
1784 int_fast16_t aExp;
1785 uint32_t aSig;
1786 a = float32_squash_input_denormal(a STATUS_VAR);
1788 aSig = extractFloat32Frac( a );
1789 aExp = extractFloat32Exp( a );
1790 aSign = extractFloat32Sign( a );
1791 if ( aExp == 0xFF ) {
1792 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1793 return packFloat64( aSign, 0x7FF, 0 );
1795 if ( aExp == 0 ) {
1796 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1797 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1798 --aExp;
1800 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1804 /*----------------------------------------------------------------------------
1805 | Returns the result of converting the single-precision floating-point value
1806 | `a' to the extended double-precision floating-point format. The conversion
1807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1808 | Arithmetic.
1809 *----------------------------------------------------------------------------*/
1811 floatx80 float32_to_floatx80(float32 a, float_status *status)
1813 flag aSign;
1814 int_fast16_t aExp;
1815 uint32_t aSig;
1817 a = float32_squash_input_denormal(a STATUS_VAR);
1818 aSig = extractFloat32Frac( a );
1819 aExp = extractFloat32Exp( a );
1820 aSign = extractFloat32Sign( a );
1821 if ( aExp == 0xFF ) {
1822 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1823 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1825 if ( aExp == 0 ) {
1826 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1827 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1829 aSig |= 0x00800000;
1830 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1834 /*----------------------------------------------------------------------------
1835 | Returns the result of converting the single-precision floating-point value
1836 | `a' to the quadruple-precision floating-point format. The conversion is
1837 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1838 | Arithmetic.
1839 *----------------------------------------------------------------------------*/
1841 float128 float32_to_float128(float32 a, float_status *status)
1843 flag aSign;
1844 int_fast16_t aExp;
1845 uint32_t aSig;
1847 a = float32_squash_input_denormal(a STATUS_VAR);
1848 aSig = extractFloat32Frac( a );
1849 aExp = extractFloat32Exp( a );
1850 aSign = extractFloat32Sign( a );
1851 if ( aExp == 0xFF ) {
1852 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1853 return packFloat128( aSign, 0x7FFF, 0, 0 );
1855 if ( aExp == 0 ) {
1856 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1857 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1858 --aExp;
1860 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1864 /*----------------------------------------------------------------------------
1865 | Rounds the single-precision floating-point value `a' to an integer, and
1866 | returns the result as a single-precision floating-point value. The
1867 | operation is performed according to the IEC/IEEE Standard for Binary
1868 | Floating-Point Arithmetic.
1869 *----------------------------------------------------------------------------*/
1871 float32 float32_round_to_int(float32 a, float_status *status)
1873 flag aSign;
1874 int_fast16_t aExp;
1875 uint32_t lastBitMask, roundBitsMask;
1876 uint32_t z;
1877 a = float32_squash_input_denormal(a STATUS_VAR);
1879 aExp = extractFloat32Exp( a );
1880 if ( 0x96 <= aExp ) {
1881 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1882 return propagateFloat32NaN( a, a STATUS_VAR );
1884 return a;
1886 if ( aExp <= 0x7E ) {
1887 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1888 STATUS(float_exception_flags) |= float_flag_inexact;
1889 aSign = extractFloat32Sign( a );
1890 switch ( STATUS(float_rounding_mode) ) {
1891 case float_round_nearest_even:
1892 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1893 return packFloat32( aSign, 0x7F, 0 );
1895 break;
1896 case float_round_ties_away:
1897 if (aExp == 0x7E) {
1898 return packFloat32(aSign, 0x7F, 0);
1900 break;
1901 case float_round_down:
1902 return make_float32(aSign ? 0xBF800000 : 0);
1903 case float_round_up:
1904 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1906 return packFloat32( aSign, 0, 0 );
1908 lastBitMask = 1;
1909 lastBitMask <<= 0x96 - aExp;
1910 roundBitsMask = lastBitMask - 1;
1911 z = float32_val(a);
1912 switch (STATUS(float_rounding_mode)) {
1913 case float_round_nearest_even:
1914 z += lastBitMask>>1;
1915 if ((z & roundBitsMask) == 0) {
1916 z &= ~lastBitMask;
1918 break;
1919 case float_round_ties_away:
1920 z += lastBitMask >> 1;
1921 break;
1922 case float_round_to_zero:
1923 break;
1924 case float_round_up:
1925 if (!extractFloat32Sign(make_float32(z))) {
1926 z += roundBitsMask;
1928 break;
1929 case float_round_down:
1930 if (extractFloat32Sign(make_float32(z))) {
1931 z += roundBitsMask;
1933 break;
1934 default:
1935 abort();
1937 z &= ~ roundBitsMask;
1938 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1939 return make_float32(z);
1943 /*----------------------------------------------------------------------------
1944 | Returns the result of adding the absolute values of the single-precision
1945 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1946 | before being returned. `zSign' is ignored if the result is a NaN.
1947 | The addition is performed according to the IEC/IEEE Standard for Binary
1948 | Floating-Point Arithmetic.
1949 *----------------------------------------------------------------------------*/
1951 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1952 float_status *status)
1954 int_fast16_t aExp, bExp, zExp;
1955 uint32_t aSig, bSig, zSig;
1956 int_fast16_t expDiff;
1958 aSig = extractFloat32Frac( a );
1959 aExp = extractFloat32Exp( a );
1960 bSig = extractFloat32Frac( b );
1961 bExp = extractFloat32Exp( b );
1962 expDiff = aExp - bExp;
1963 aSig <<= 6;
1964 bSig <<= 6;
1965 if ( 0 < expDiff ) {
1966 if ( aExp == 0xFF ) {
1967 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1968 return a;
1970 if ( bExp == 0 ) {
1971 --expDiff;
1973 else {
1974 bSig |= 0x20000000;
1976 shift32RightJamming( bSig, expDiff, &bSig );
1977 zExp = aExp;
1979 else if ( expDiff < 0 ) {
1980 if ( bExp == 0xFF ) {
1981 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982 return packFloat32( zSign, 0xFF, 0 );
1984 if ( aExp == 0 ) {
1985 ++expDiff;
1987 else {
1988 aSig |= 0x20000000;
1990 shift32RightJamming( aSig, - expDiff, &aSig );
1991 zExp = bExp;
1993 else {
1994 if ( aExp == 0xFF ) {
1995 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1996 return a;
1998 if ( aExp == 0 ) {
1999 if (STATUS(flush_to_zero)) {
2000 if (aSig | bSig) {
2001 float_raise(float_flag_output_denormal STATUS_VAR);
2003 return packFloat32(zSign, 0, 0);
2005 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2007 zSig = 0x40000000 + aSig + bSig;
2008 zExp = aExp;
2009 goto roundAndPack;
2011 aSig |= 0x20000000;
2012 zSig = ( aSig + bSig )<<1;
2013 --zExp;
2014 if ( (int32_t) zSig < 0 ) {
2015 zSig = aSig + bSig;
2016 ++zExp;
2018 roundAndPack:
2019 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023 /*----------------------------------------------------------------------------
2024 | Returns the result of subtracting the absolute values of the single-
2025 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2026 | difference is negated before being returned. `zSign' is ignored if the
2027 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2028 | Standard for Binary Floating-Point Arithmetic.
2029 *----------------------------------------------------------------------------*/
2031 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2032 float_status *status)
2034 int_fast16_t aExp, bExp, zExp;
2035 uint32_t aSig, bSig, zSig;
2036 int_fast16_t expDiff;
2038 aSig = extractFloat32Frac( a );
2039 aExp = extractFloat32Exp( a );
2040 bSig = extractFloat32Frac( b );
2041 bExp = extractFloat32Exp( b );
2042 expDiff = aExp - bExp;
2043 aSig <<= 7;
2044 bSig <<= 7;
2045 if ( 0 < expDiff ) goto aExpBigger;
2046 if ( expDiff < 0 ) goto bExpBigger;
2047 if ( aExp == 0xFF ) {
2048 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2049 float_raise( float_flag_invalid STATUS_VAR);
2050 return float32_default_nan;
2052 if ( aExp == 0 ) {
2053 aExp = 1;
2054 bExp = 1;
2056 if ( bSig < aSig ) goto aBigger;
2057 if ( aSig < bSig ) goto bBigger;
2058 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2059 bExpBigger:
2060 if ( bExp == 0xFF ) {
2061 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2062 return packFloat32( zSign ^ 1, 0xFF, 0 );
2064 if ( aExp == 0 ) {
2065 ++expDiff;
2067 else {
2068 aSig |= 0x40000000;
2070 shift32RightJamming( aSig, - expDiff, &aSig );
2071 bSig |= 0x40000000;
2072 bBigger:
2073 zSig = bSig - aSig;
2074 zExp = bExp;
2075 zSign ^= 1;
2076 goto normalizeRoundAndPack;
2077 aExpBigger:
2078 if ( aExp == 0xFF ) {
2079 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2080 return a;
2082 if ( bExp == 0 ) {
2083 --expDiff;
2085 else {
2086 bSig |= 0x40000000;
2088 shift32RightJamming( bSig, expDiff, &bSig );
2089 aSig |= 0x40000000;
2090 aBigger:
2091 zSig = aSig - bSig;
2092 zExp = aExp;
2093 normalizeRoundAndPack:
2094 --zExp;
2095 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2099 /*----------------------------------------------------------------------------
2100 | Returns the result of adding the single-precision floating-point values `a'
2101 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2102 | Binary Floating-Point Arithmetic.
2103 *----------------------------------------------------------------------------*/
2105 float32 float32_add(float32 a, float32 b, float_status *status)
2107 flag aSign, bSign;
2108 a = float32_squash_input_denormal(a STATUS_VAR);
2109 b = float32_squash_input_denormal(b STATUS_VAR);
2111 aSign = extractFloat32Sign( a );
2112 bSign = extractFloat32Sign( b );
2113 if ( aSign == bSign ) {
2114 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2116 else {
2117 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2122 /*----------------------------------------------------------------------------
2123 | Returns the result of subtracting the single-precision floating-point values
2124 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2125 | for Binary Floating-Point Arithmetic.
2126 *----------------------------------------------------------------------------*/
2128 float32 float32_sub(float32 a, float32 b, float_status *status)
2130 flag aSign, bSign;
2131 a = float32_squash_input_denormal(a STATUS_VAR);
2132 b = float32_squash_input_denormal(b STATUS_VAR);
2134 aSign = extractFloat32Sign( a );
2135 bSign = extractFloat32Sign( b );
2136 if ( aSign == bSign ) {
2137 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2139 else {
2140 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2145 /*----------------------------------------------------------------------------
2146 | Returns the result of multiplying the single-precision floating-point values
2147 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2148 | for Binary Floating-Point Arithmetic.
2149 *----------------------------------------------------------------------------*/
2151 float32 float32_mul(float32 a, float32 b, float_status *status)
2153 flag aSign, bSign, zSign;
2154 int_fast16_t aExp, bExp, zExp;
2155 uint32_t aSig, bSig;
2156 uint64_t zSig64;
2157 uint32_t zSig;
2159 a = float32_squash_input_denormal(a STATUS_VAR);
2160 b = float32_squash_input_denormal(b STATUS_VAR);
2162 aSig = extractFloat32Frac( a );
2163 aExp = extractFloat32Exp( a );
2164 aSign = extractFloat32Sign( a );
2165 bSig = extractFloat32Frac( b );
2166 bExp = extractFloat32Exp( b );
2167 bSign = extractFloat32Sign( b );
2168 zSign = aSign ^ bSign;
2169 if ( aExp == 0xFF ) {
2170 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2171 return propagateFloat32NaN( a, b STATUS_VAR );
2173 if ( ( bExp | bSig ) == 0 ) {
2174 float_raise( float_flag_invalid STATUS_VAR);
2175 return float32_default_nan;
2177 return packFloat32( zSign, 0xFF, 0 );
2179 if ( bExp == 0xFF ) {
2180 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2181 if ( ( aExp | aSig ) == 0 ) {
2182 float_raise( float_flag_invalid STATUS_VAR);
2183 return float32_default_nan;
2185 return packFloat32( zSign, 0xFF, 0 );
2187 if ( aExp == 0 ) {
2188 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2189 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2191 if ( bExp == 0 ) {
2192 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2193 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2195 zExp = aExp + bExp - 0x7F;
2196 aSig = ( aSig | 0x00800000 )<<7;
2197 bSig = ( bSig | 0x00800000 )<<8;
2198 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2199 zSig = zSig64;
2200 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2201 zSig <<= 1;
2202 --zExp;
2204 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2208 /*----------------------------------------------------------------------------
2209 | Returns the result of dividing the single-precision floating-point value `a'
2210 | by the corresponding value `b'. The operation is performed according to the
2211 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2212 *----------------------------------------------------------------------------*/
2214 float32 float32_div(float32 a, float32 b, float_status *status)
2216 flag aSign, bSign, zSign;
2217 int_fast16_t aExp, bExp, zExp;
2218 uint32_t aSig, bSig, zSig;
2219 a = float32_squash_input_denormal(a STATUS_VAR);
2220 b = float32_squash_input_denormal(b STATUS_VAR);
2222 aSig = extractFloat32Frac( a );
2223 aExp = extractFloat32Exp( a );
2224 aSign = extractFloat32Sign( a );
2225 bSig = extractFloat32Frac( b );
2226 bExp = extractFloat32Exp( b );
2227 bSign = extractFloat32Sign( b );
2228 zSign = aSign ^ bSign;
2229 if ( aExp == 0xFF ) {
2230 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2231 if ( bExp == 0xFF ) {
2232 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2233 float_raise( float_flag_invalid STATUS_VAR);
2234 return float32_default_nan;
2236 return packFloat32( zSign, 0xFF, 0 );
2238 if ( bExp == 0xFF ) {
2239 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2240 return packFloat32( zSign, 0, 0 );
2242 if ( bExp == 0 ) {
2243 if ( bSig == 0 ) {
2244 if ( ( aExp | aSig ) == 0 ) {
2245 float_raise( float_flag_invalid STATUS_VAR);
2246 return float32_default_nan;
2248 float_raise( float_flag_divbyzero STATUS_VAR);
2249 return packFloat32( zSign, 0xFF, 0 );
2251 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2253 if ( aExp == 0 ) {
2254 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2255 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2257 zExp = aExp - bExp + 0x7D;
2258 aSig = ( aSig | 0x00800000 )<<7;
2259 bSig = ( bSig | 0x00800000 )<<8;
2260 if ( bSig <= ( aSig + aSig ) ) {
2261 aSig >>= 1;
2262 ++zExp;
2264 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2265 if ( ( zSig & 0x3F ) == 0 ) {
2266 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2268 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2272 /*----------------------------------------------------------------------------
2273 | Returns the remainder of the single-precision floating-point value `a'
2274 | with respect to the corresponding value `b'. The operation is performed
2275 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 *----------------------------------------------------------------------------*/
2278 float32 float32_rem(float32 a, float32 b, float_status *status)
2280 flag aSign, zSign;
2281 int_fast16_t aExp, bExp, expDiff;
2282 uint32_t aSig, bSig;
2283 uint32_t q;
2284 uint64_t aSig64, bSig64, q64;
2285 uint32_t alternateASig;
2286 int32_t sigMean;
2287 a = float32_squash_input_denormal(a STATUS_VAR);
2288 b = float32_squash_input_denormal(b STATUS_VAR);
2290 aSig = extractFloat32Frac( a );
2291 aExp = extractFloat32Exp( a );
2292 aSign = extractFloat32Sign( a );
2293 bSig = extractFloat32Frac( b );
2294 bExp = extractFloat32Exp( b );
2295 if ( aExp == 0xFF ) {
2296 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2297 return propagateFloat32NaN( a, b STATUS_VAR );
2299 float_raise( float_flag_invalid STATUS_VAR);
2300 return float32_default_nan;
2302 if ( bExp == 0xFF ) {
2303 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2304 return a;
2306 if ( bExp == 0 ) {
2307 if ( bSig == 0 ) {
2308 float_raise( float_flag_invalid STATUS_VAR);
2309 return float32_default_nan;
2311 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2313 if ( aExp == 0 ) {
2314 if ( aSig == 0 ) return a;
2315 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2317 expDiff = aExp - bExp;
2318 aSig |= 0x00800000;
2319 bSig |= 0x00800000;
2320 if ( expDiff < 32 ) {
2321 aSig <<= 8;
2322 bSig <<= 8;
2323 if ( expDiff < 0 ) {
2324 if ( expDiff < -1 ) return a;
2325 aSig >>= 1;
2327 q = ( bSig <= aSig );
2328 if ( q ) aSig -= bSig;
2329 if ( 0 < expDiff ) {
2330 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2331 q >>= 32 - expDiff;
2332 bSig >>= 2;
2333 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2335 else {
2336 aSig >>= 2;
2337 bSig >>= 2;
2340 else {
2341 if ( bSig <= aSig ) aSig -= bSig;
2342 aSig64 = ( (uint64_t) aSig )<<40;
2343 bSig64 = ( (uint64_t) bSig )<<40;
2344 expDiff -= 64;
2345 while ( 0 < expDiff ) {
2346 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2347 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2348 aSig64 = - ( ( bSig * q64 )<<38 );
2349 expDiff -= 62;
2351 expDiff += 64;
2352 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2353 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2354 q = q64>>( 64 - expDiff );
2355 bSig <<= 6;
2356 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2358 do {
2359 alternateASig = aSig;
2360 ++q;
2361 aSig -= bSig;
2362 } while ( 0 <= (int32_t) aSig );
2363 sigMean = aSig + alternateASig;
2364 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2365 aSig = alternateASig;
2367 zSign = ( (int32_t) aSig < 0 );
2368 if ( zSign ) aSig = - aSig;
2369 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2373 /*----------------------------------------------------------------------------
2374 | Returns the result of multiplying the single-precision floating-point values
2375 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2376 | multiplication. The operation is performed according to the IEC/IEEE
2377 | Standard for Binary Floating-Point Arithmetic 754-2008.
2378 | The flags argument allows the caller to select negation of the
2379 | addend, the intermediate product, or the final result. (The difference
2380 | between this and having the caller do a separate negation is that negating
2381 | externally will flip the sign bit on NaNs.)
2382 *----------------------------------------------------------------------------*/
2384 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2385 float_status *status)
2387 flag aSign, bSign, cSign, zSign;
2388 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2389 uint32_t aSig, bSig, cSig;
2390 flag pInf, pZero, pSign;
2391 uint64_t pSig64, cSig64, zSig64;
2392 uint32_t pSig;
2393 int shiftcount;
2394 flag signflip, infzero;
2396 a = float32_squash_input_denormal(a STATUS_VAR);
2397 b = float32_squash_input_denormal(b STATUS_VAR);
2398 c = float32_squash_input_denormal(c STATUS_VAR);
2399 aSig = extractFloat32Frac(a);
2400 aExp = extractFloat32Exp(a);
2401 aSign = extractFloat32Sign(a);
2402 bSig = extractFloat32Frac(b);
2403 bExp = extractFloat32Exp(b);
2404 bSign = extractFloat32Sign(b);
2405 cSig = extractFloat32Frac(c);
2406 cExp = extractFloat32Exp(c);
2407 cSign = extractFloat32Sign(c);
2409 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2410 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2412 /* It is implementation-defined whether the cases of (0,inf,qnan)
2413 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2414 * they return if they do), so we have to hand this information
2415 * off to the target-specific pick-a-NaN routine.
2417 if (((aExp == 0xff) && aSig) ||
2418 ((bExp == 0xff) && bSig) ||
2419 ((cExp == 0xff) && cSig)) {
2420 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2423 if (infzero) {
2424 float_raise(float_flag_invalid STATUS_VAR);
2425 return float32_default_nan;
2428 if (flags & float_muladd_negate_c) {
2429 cSign ^= 1;
2432 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2434 /* Work out the sign and type of the product */
2435 pSign = aSign ^ bSign;
2436 if (flags & float_muladd_negate_product) {
2437 pSign ^= 1;
2439 pInf = (aExp == 0xff) || (bExp == 0xff);
2440 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2442 if (cExp == 0xff) {
2443 if (pInf && (pSign ^ cSign)) {
2444 /* addition of opposite-signed infinities => InvalidOperation */
2445 float_raise(float_flag_invalid STATUS_VAR);
2446 return float32_default_nan;
2448 /* Otherwise generate an infinity of the same sign */
2449 return packFloat32(cSign ^ signflip, 0xff, 0);
2452 if (pInf) {
2453 return packFloat32(pSign ^ signflip, 0xff, 0);
2456 if (pZero) {
2457 if (cExp == 0) {
2458 if (cSig == 0) {
2459 /* Adding two exact zeroes */
2460 if (pSign == cSign) {
2461 zSign = pSign;
2462 } else if (STATUS(float_rounding_mode) == float_round_down) {
2463 zSign = 1;
2464 } else {
2465 zSign = 0;
2467 return packFloat32(zSign ^ signflip, 0, 0);
2469 /* Exact zero plus a denorm */
2470 if (STATUS(flush_to_zero)) {
2471 float_raise(float_flag_output_denormal STATUS_VAR);
2472 return packFloat32(cSign ^ signflip, 0, 0);
2475 /* Zero plus something non-zero : just return the something */
2476 if (flags & float_muladd_halve_result) {
2477 if (cExp == 0) {
2478 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2480 /* Subtract one to halve, and one again because roundAndPackFloat32
2481 * wants one less than the true exponent.
2483 cExp -= 2;
2484 cSig = (cSig | 0x00800000) << 7;
2485 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2487 return packFloat32(cSign ^ signflip, cExp, cSig);
2490 if (aExp == 0) {
2491 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2493 if (bExp == 0) {
2494 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2497 /* Calculate the actual result a * b + c */
2499 /* Multiply first; this is easy. */
2500 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2501 * because we want the true exponent, not the "one-less-than"
2502 * flavour that roundAndPackFloat32() takes.
2504 pExp = aExp + bExp - 0x7e;
2505 aSig = (aSig | 0x00800000) << 7;
2506 bSig = (bSig | 0x00800000) << 8;
2507 pSig64 = (uint64_t)aSig * bSig;
2508 if ((int64_t)(pSig64 << 1) >= 0) {
2509 pSig64 <<= 1;
2510 pExp--;
2513 zSign = pSign ^ signflip;
2515 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2516 * position 62.
2518 if (cExp == 0) {
2519 if (!cSig) {
2520 /* Throw out the special case of c being an exact zero now */
2521 shift64RightJamming(pSig64, 32, &pSig64);
2522 pSig = pSig64;
2523 if (flags & float_muladd_halve_result) {
2524 pExp--;
2526 return roundAndPackFloat32(zSign, pExp - 1,
2527 pSig STATUS_VAR);
2529 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2532 cSig64 = (uint64_t)cSig << (62 - 23);
2533 cSig64 |= LIT64(0x4000000000000000);
2534 expDiff = pExp - cExp;
2536 if (pSign == cSign) {
2537 /* Addition */
2538 if (expDiff > 0) {
2539 /* scale c to match p */
2540 shift64RightJamming(cSig64, expDiff, &cSig64);
2541 zExp = pExp;
2542 } else if (expDiff < 0) {
2543 /* scale p to match c */
2544 shift64RightJamming(pSig64, -expDiff, &pSig64);
2545 zExp = cExp;
2546 } else {
2547 /* no scaling needed */
2548 zExp = cExp;
2550 /* Add significands and make sure explicit bit ends up in posn 62 */
2551 zSig64 = pSig64 + cSig64;
2552 if ((int64_t)zSig64 < 0) {
2553 shift64RightJamming(zSig64, 1, &zSig64);
2554 } else {
2555 zExp--;
2557 } else {
2558 /* Subtraction */
2559 if (expDiff > 0) {
2560 shift64RightJamming(cSig64, expDiff, &cSig64);
2561 zSig64 = pSig64 - cSig64;
2562 zExp = pExp;
2563 } else if (expDiff < 0) {
2564 shift64RightJamming(pSig64, -expDiff, &pSig64);
2565 zSig64 = cSig64 - pSig64;
2566 zExp = cExp;
2567 zSign ^= 1;
2568 } else {
2569 zExp = pExp;
2570 if (cSig64 < pSig64) {
2571 zSig64 = pSig64 - cSig64;
2572 } else if (pSig64 < cSig64) {
2573 zSig64 = cSig64 - pSig64;
2574 zSign ^= 1;
2575 } else {
2576 /* Exact zero */
2577 zSign = signflip;
2578 if (STATUS(float_rounding_mode) == float_round_down) {
2579 zSign ^= 1;
2581 return packFloat32(zSign, 0, 0);
2584 --zExp;
2585 /* Normalize to put the explicit bit back into bit 62. */
2586 shiftcount = countLeadingZeros64(zSig64) - 1;
2587 zSig64 <<= shiftcount;
2588 zExp -= shiftcount;
2590 if (flags & float_muladd_halve_result) {
2591 zExp--;
2594 shift64RightJamming(zSig64, 32, &zSig64);
2595 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2599 /*----------------------------------------------------------------------------
2600 | Returns the square root of the single-precision floating-point value `a'.
2601 | The operation is performed according to the IEC/IEEE Standard for Binary
2602 | Floating-Point Arithmetic.
2603 *----------------------------------------------------------------------------*/
2605 float32 float32_sqrt(float32 a, float_status *status)
2607 flag aSign;
2608 int_fast16_t aExp, zExp;
2609 uint32_t aSig, zSig;
2610 uint64_t rem, term;
2611 a = float32_squash_input_denormal(a STATUS_VAR);
2613 aSig = extractFloat32Frac( a );
2614 aExp = extractFloat32Exp( a );
2615 aSign = extractFloat32Sign( a );
2616 if ( aExp == 0xFF ) {
2617 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2618 if ( ! aSign ) return a;
2619 float_raise( float_flag_invalid STATUS_VAR);
2620 return float32_default_nan;
2622 if ( aSign ) {
2623 if ( ( aExp | aSig ) == 0 ) return a;
2624 float_raise( float_flag_invalid STATUS_VAR);
2625 return float32_default_nan;
2627 if ( aExp == 0 ) {
2628 if ( aSig == 0 ) return float32_zero;
2629 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2631 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2632 aSig = ( aSig | 0x00800000 )<<8;
2633 zSig = estimateSqrt32( aExp, aSig ) + 2;
2634 if ( ( zSig & 0x7F ) <= 5 ) {
2635 if ( zSig < 2 ) {
2636 zSig = 0x7FFFFFFF;
2637 goto roundAndPack;
2639 aSig >>= aExp & 1;
2640 term = ( (uint64_t) zSig ) * zSig;
2641 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2642 while ( (int64_t) rem < 0 ) {
2643 --zSig;
2644 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2646 zSig |= ( rem != 0 );
2648 shift32RightJamming( zSig, 1, &zSig );
2649 roundAndPack:
2650 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2654 /*----------------------------------------------------------------------------
2655 | Returns the binary exponential of the single-precision floating-point value
2656 | `a'. The operation is performed according to the IEC/IEEE Standard for
2657 | Binary Floating-Point Arithmetic.
2659 | Uses the following identities:
2661 | 1. -------------------------------------------------------------------------
2662 | x x*ln(2)
2663 | 2 = e
2665 | 2. -------------------------------------------------------------------------
2666 | 2 3 4 5 n
2667 | x x x x x x x
2668 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2669 | 1! 2! 3! 4! 5! n!
2670 *----------------------------------------------------------------------------*/
2672 static const float64 float32_exp2_coefficients[15] =
2674 const_float64( 0x3ff0000000000000ll ), /* 1 */
2675 const_float64( 0x3fe0000000000000ll ), /* 2 */
2676 const_float64( 0x3fc5555555555555ll ), /* 3 */
2677 const_float64( 0x3fa5555555555555ll ), /* 4 */
2678 const_float64( 0x3f81111111111111ll ), /* 5 */
2679 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2680 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2681 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2682 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2683 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2684 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2685 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2686 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2687 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2688 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2691 float32 float32_exp2(float32 a, float_status *status)
2693 flag aSign;
2694 int_fast16_t aExp;
2695 uint32_t aSig;
2696 float64 r, x, xn;
2697 int i;
2698 a = float32_squash_input_denormal(a STATUS_VAR);
2700 aSig = extractFloat32Frac( a );
2701 aExp = extractFloat32Exp( a );
2702 aSign = extractFloat32Sign( a );
2704 if ( aExp == 0xFF) {
2705 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2706 return (aSign) ? float32_zero : a;
2708 if (aExp == 0) {
2709 if (aSig == 0) return float32_one;
2712 float_raise( float_flag_inexact STATUS_VAR);
2714 /* ******************************* */
2715 /* using float64 for approximation */
2716 /* ******************************* */
2717 x = float32_to_float64(a STATUS_VAR);
2718 x = float64_mul(x, float64_ln2 STATUS_VAR);
2720 xn = x;
2721 r = float64_one;
2722 for (i = 0 ; i < 15 ; i++) {
2723 float64 f;
2725 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2726 r = float64_add(r, f STATUS_VAR);
2728 xn = float64_mul(xn, x STATUS_VAR);
2731 return float64_to_float32(r, status);
2734 /*----------------------------------------------------------------------------
2735 | Returns the binary log of the single-precision floating-point value `a'.
2736 | The operation is performed according to the IEC/IEEE Standard for Binary
2737 | Floating-Point Arithmetic.
2738 *----------------------------------------------------------------------------*/
2739 float32 float32_log2(float32 a, float_status *status)
2741 flag aSign, zSign;
2742 int_fast16_t aExp;
2743 uint32_t aSig, zSig, i;
2745 a = float32_squash_input_denormal(a STATUS_VAR);
2746 aSig = extractFloat32Frac( a );
2747 aExp = extractFloat32Exp( a );
2748 aSign = extractFloat32Sign( a );
2750 if ( aExp == 0 ) {
2751 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2752 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2754 if ( aSign ) {
2755 float_raise( float_flag_invalid STATUS_VAR);
2756 return float32_default_nan;
2758 if ( aExp == 0xFF ) {
2759 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2760 return a;
2763 aExp -= 0x7F;
2764 aSig |= 0x00800000;
2765 zSign = aExp < 0;
2766 zSig = aExp << 23;
2768 for (i = 1 << 22; i > 0; i >>= 1) {
2769 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2770 if ( aSig & 0x01000000 ) {
2771 aSig >>= 1;
2772 zSig |= i;
2776 if ( zSign )
2777 zSig = -zSig;
2779 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2782 /*----------------------------------------------------------------------------
2783 | Returns 1 if the single-precision floating-point value `a' is equal to
2784 | the corresponding value `b', and 0 otherwise. The invalid exception is
2785 | raised if either operand is a NaN. Otherwise, the comparison is performed
2786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2787 *----------------------------------------------------------------------------*/
2789 int float32_eq(float32 a, float32 b, float_status *status)
2791 uint32_t av, bv;
2792 a = float32_squash_input_denormal(a STATUS_VAR);
2793 b = float32_squash_input_denormal(b STATUS_VAR);
2795 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2796 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2798 float_raise( float_flag_invalid STATUS_VAR);
2799 return 0;
2801 av = float32_val(a);
2802 bv = float32_val(b);
2803 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2806 /*----------------------------------------------------------------------------
2807 | Returns 1 if the single-precision floating-point value `a' is less than
2808 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2809 | exception is raised if either operand is a NaN. The comparison is performed
2810 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2811 *----------------------------------------------------------------------------*/
2813 int float32_le(float32 a, float32 b, float_status *status)
2815 flag aSign, bSign;
2816 uint32_t av, bv;
2817 a = float32_squash_input_denormal(a STATUS_VAR);
2818 b = float32_squash_input_denormal(b STATUS_VAR);
2820 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2821 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2823 float_raise( float_flag_invalid STATUS_VAR);
2824 return 0;
2826 aSign = extractFloat32Sign( a );
2827 bSign = extractFloat32Sign( b );
2828 av = float32_val(a);
2829 bv = float32_val(b);
2830 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2831 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2835 /*----------------------------------------------------------------------------
2836 | Returns 1 if the single-precision floating-point value `a' is less than
2837 | the corresponding value `b', and 0 otherwise. The invalid exception is
2838 | raised if either operand is a NaN. The comparison is performed according
2839 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2840 *----------------------------------------------------------------------------*/
2842 int float32_lt(float32 a, float32 b, float_status *status)
2844 flag aSign, bSign;
2845 uint32_t av, bv;
2846 a = float32_squash_input_denormal(a STATUS_VAR);
2847 b = float32_squash_input_denormal(b STATUS_VAR);
2849 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2850 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2852 float_raise( float_flag_invalid STATUS_VAR);
2853 return 0;
2855 aSign = extractFloat32Sign( a );
2856 bSign = extractFloat32Sign( b );
2857 av = float32_val(a);
2858 bv = float32_val(b);
2859 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2860 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2864 /*----------------------------------------------------------------------------
2865 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2866 | be compared, and 0 otherwise. The invalid exception is raised if either
2867 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2868 | Standard for Binary Floating-Point Arithmetic.
2869 *----------------------------------------------------------------------------*/
2871 int float32_unordered(float32 a, float32 b, float_status *status)
2873 a = float32_squash_input_denormal(a STATUS_VAR);
2874 b = float32_squash_input_denormal(b STATUS_VAR);
2876 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2877 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2879 float_raise( float_flag_invalid STATUS_VAR);
2880 return 1;
2882 return 0;
2885 /*----------------------------------------------------------------------------
2886 | Returns 1 if the single-precision floating-point value `a' is equal to
2887 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2888 | exception. The comparison is performed according to the IEC/IEEE Standard
2889 | for Binary Floating-Point Arithmetic.
2890 *----------------------------------------------------------------------------*/
2892 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2894 a = float32_squash_input_denormal(a STATUS_VAR);
2895 b = float32_squash_input_denormal(b STATUS_VAR);
2897 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2898 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2900 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2901 float_raise( float_flag_invalid STATUS_VAR);
2903 return 0;
2905 return ( float32_val(a) == float32_val(b) ) ||
2906 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2909 /*----------------------------------------------------------------------------
2910 | Returns 1 if the single-precision floating-point value `a' is less than or
2911 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2912 | cause an exception. Otherwise, the comparison is performed according to the
2913 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2914 *----------------------------------------------------------------------------*/
2916 int float32_le_quiet(float32 a, float32 b, float_status *status)
2918 flag aSign, bSign;
2919 uint32_t av, bv;
2920 a = float32_squash_input_denormal(a STATUS_VAR);
2921 b = float32_squash_input_denormal(b STATUS_VAR);
2923 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2924 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2926 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2927 float_raise( float_flag_invalid STATUS_VAR);
2929 return 0;
2931 aSign = extractFloat32Sign( a );
2932 bSign = extractFloat32Sign( b );
2933 av = float32_val(a);
2934 bv = float32_val(b);
2935 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2936 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2940 /*----------------------------------------------------------------------------
2941 | Returns 1 if the single-precision floating-point value `a' is less than
2942 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2943 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2944 | Standard for Binary Floating-Point Arithmetic.
2945 *----------------------------------------------------------------------------*/
2947 int float32_lt_quiet(float32 a, float32 b, float_status *status)
2949 flag aSign, bSign;
2950 uint32_t av, bv;
2951 a = float32_squash_input_denormal(a STATUS_VAR);
2952 b = float32_squash_input_denormal(b STATUS_VAR);
2954 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2955 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2957 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2958 float_raise( float_flag_invalid STATUS_VAR);
2960 return 0;
2962 aSign = extractFloat32Sign( a );
2963 bSign = extractFloat32Sign( b );
2964 av = float32_val(a);
2965 bv = float32_val(b);
2966 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2967 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2971 /*----------------------------------------------------------------------------
2972 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2973 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2974 | comparison is performed according to the IEC/IEEE Standard for Binary
2975 | Floating-Point Arithmetic.
2976 *----------------------------------------------------------------------------*/
2978 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
2980 a = float32_squash_input_denormal(a STATUS_VAR);
2981 b = float32_squash_input_denormal(b STATUS_VAR);
2983 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2984 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2986 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2987 float_raise( float_flag_invalid STATUS_VAR);
2989 return 1;
2991 return 0;
2994 /*----------------------------------------------------------------------------
2995 | Returns the result of converting the double-precision floating-point value
2996 | `a' to the 32-bit two's complement integer format. The conversion is
2997 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2998 | Arithmetic---which means in particular that the conversion is rounded
2999 | according to the current rounding mode. If `a' is a NaN, the largest
3000 | positive integer is returned. Otherwise, if the conversion overflows, the
3001 | largest integer with the same sign as `a' is returned.
3002 *----------------------------------------------------------------------------*/
3004 int32 float64_to_int32(float64 a, float_status *status)
3006 flag aSign;
3007 int_fast16_t aExp, shiftCount;
3008 uint64_t aSig;
3009 a = float64_squash_input_denormal(a STATUS_VAR);
3011 aSig = extractFloat64Frac( a );
3012 aExp = extractFloat64Exp( a );
3013 aSign = extractFloat64Sign( a );
3014 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3015 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3016 shiftCount = 0x42C - aExp;
3017 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3018 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3022 /*----------------------------------------------------------------------------
3023 | Returns the result of converting the double-precision floating-point value
3024 | `a' to the 32-bit two's complement integer format. The conversion is
3025 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3026 | Arithmetic, except that the conversion is always rounded toward zero.
3027 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3028 | the conversion overflows, the largest integer with the same sign as `a' is
3029 | returned.
3030 *----------------------------------------------------------------------------*/
3032 int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
3034 flag aSign;
3035 int_fast16_t aExp, shiftCount;
3036 uint64_t aSig, savedASig;
3037 int32_t z;
3038 a = float64_squash_input_denormal(a STATUS_VAR);
3040 aSig = extractFloat64Frac( a );
3041 aExp = extractFloat64Exp( a );
3042 aSign = extractFloat64Sign( a );
3043 if ( 0x41E < aExp ) {
3044 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3045 goto invalid;
3047 else if ( aExp < 0x3FF ) {
3048 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3049 return 0;
3051 aSig |= LIT64( 0x0010000000000000 );
3052 shiftCount = 0x433 - aExp;
3053 savedASig = aSig;
3054 aSig >>= shiftCount;
3055 z = aSig;
3056 if ( aSign ) z = - z;
3057 if ( ( z < 0 ) ^ aSign ) {
3058 invalid:
3059 float_raise( float_flag_invalid STATUS_VAR);
3060 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3062 if ( ( aSig<<shiftCount ) != savedASig ) {
3063 STATUS(float_exception_flags) |= float_flag_inexact;
3065 return z;
3069 /*----------------------------------------------------------------------------
3070 | Returns the result of converting the double-precision floating-point value
3071 | `a' to the 16-bit two's complement integer format. The conversion is
3072 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3073 | Arithmetic, except that the conversion is always rounded toward zero.
3074 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3075 | the conversion overflows, the largest integer with the same sign as `a' is
3076 | returned.
3077 *----------------------------------------------------------------------------*/
3079 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3081 flag aSign;
3082 int_fast16_t aExp, shiftCount;
3083 uint64_t aSig, savedASig;
3084 int32 z;
3086 aSig = extractFloat64Frac( a );
3087 aExp = extractFloat64Exp( a );
3088 aSign = extractFloat64Sign( a );
3089 if ( 0x40E < aExp ) {
3090 if ( ( aExp == 0x7FF ) && aSig ) {
3091 aSign = 0;
3093 goto invalid;
3095 else if ( aExp < 0x3FF ) {
3096 if ( aExp || aSig ) {
3097 STATUS(float_exception_flags) |= float_flag_inexact;
3099 return 0;
3101 aSig |= LIT64( 0x0010000000000000 );
3102 shiftCount = 0x433 - aExp;
3103 savedASig = aSig;
3104 aSig >>= shiftCount;
3105 z = aSig;
3106 if ( aSign ) {
3107 z = - z;
3109 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3110 invalid:
3111 float_raise( float_flag_invalid STATUS_VAR);
3112 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3114 if ( ( aSig<<shiftCount ) != savedASig ) {
3115 STATUS(float_exception_flags) |= float_flag_inexact;
3117 return z;
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the double-precision floating-point value
3122 | `a' to the 64-bit two's complement integer format. The conversion is
3123 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3124 | Arithmetic---which means in particular that the conversion is rounded
3125 | according to the current rounding mode. If `a' is a NaN, the largest
3126 | positive integer is returned. Otherwise, if the conversion overflows, the
3127 | largest integer with the same sign as `a' is returned.
3128 *----------------------------------------------------------------------------*/
3130 int64 float64_to_int64(float64 a, float_status *status)
3132 flag aSign;
3133 int_fast16_t aExp, shiftCount;
3134 uint64_t aSig, aSigExtra;
3135 a = float64_squash_input_denormal(a STATUS_VAR);
3137 aSig = extractFloat64Frac( a );
3138 aExp = extractFloat64Exp( a );
3139 aSign = extractFloat64Sign( a );
3140 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3141 shiftCount = 0x433 - aExp;
3142 if ( shiftCount <= 0 ) {
3143 if ( 0x43E < aExp ) {
3144 float_raise( float_flag_invalid STATUS_VAR);
3145 if ( ! aSign
3146 || ( ( aExp == 0x7FF )
3147 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3149 return LIT64( 0x7FFFFFFFFFFFFFFF );
3151 return (int64_t) LIT64( 0x8000000000000000 );
3153 aSigExtra = 0;
3154 aSig <<= - shiftCount;
3156 else {
3157 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3159 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3163 /*----------------------------------------------------------------------------
3164 | Returns the result of converting the double-precision floating-point value
3165 | `a' to the 64-bit two's complement integer format. The conversion is
3166 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3167 | Arithmetic, except that the conversion is always rounded toward zero.
3168 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3169 | the conversion overflows, the largest integer with the same sign as `a' is
3170 | returned.
3171 *----------------------------------------------------------------------------*/
3173 int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
3175 flag aSign;
3176 int_fast16_t aExp, shiftCount;
3177 uint64_t aSig;
3178 int64 z;
3179 a = float64_squash_input_denormal(a STATUS_VAR);
3181 aSig = extractFloat64Frac( a );
3182 aExp = extractFloat64Exp( a );
3183 aSign = extractFloat64Sign( a );
3184 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3185 shiftCount = aExp - 0x433;
3186 if ( 0 <= shiftCount ) {
3187 if ( 0x43E <= aExp ) {
3188 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3189 float_raise( float_flag_invalid STATUS_VAR);
3190 if ( ! aSign
3191 || ( ( aExp == 0x7FF )
3192 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3194 return LIT64( 0x7FFFFFFFFFFFFFFF );
3197 return (int64_t) LIT64( 0x8000000000000000 );
3199 z = aSig<<shiftCount;
3201 else {
3202 if ( aExp < 0x3FE ) {
3203 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3204 return 0;
3206 z = aSig>>( - shiftCount );
3207 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3208 STATUS(float_exception_flags) |= float_flag_inexact;
3211 if ( aSign ) z = - z;
3212 return z;
3216 /*----------------------------------------------------------------------------
3217 | Returns the result of converting the double-precision floating-point value
3218 | `a' to the single-precision floating-point format. The conversion is
3219 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3220 | Arithmetic.
3221 *----------------------------------------------------------------------------*/
3223 float32 float64_to_float32(float64 a, float_status *status)
3225 flag aSign;
3226 int_fast16_t aExp;
3227 uint64_t aSig;
3228 uint32_t zSig;
3229 a = float64_squash_input_denormal(a STATUS_VAR);
3231 aSig = extractFloat64Frac( a );
3232 aExp = extractFloat64Exp( a );
3233 aSign = extractFloat64Sign( a );
3234 if ( aExp == 0x7FF ) {
3235 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3236 return packFloat32( aSign, 0xFF, 0 );
3238 shift64RightJamming( aSig, 22, &aSig );
3239 zSig = aSig;
3240 if ( aExp || zSig ) {
3241 zSig |= 0x40000000;
3242 aExp -= 0x381;
3244 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3249 /*----------------------------------------------------------------------------
3250 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3251 | half-precision floating-point value, returning the result. After being
3252 | shifted into the proper positions, the three fields are simply added
3253 | together to form the result. This means that any integer portion of `zSig'
3254 | will be added into the exponent. Since a properly normalized significand
3255 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3256 | than the desired result exponent whenever `zSig' is a complete, normalized
3257 | significand.
3258 *----------------------------------------------------------------------------*/
3259 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3261 return make_float16(
3262 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3265 /*----------------------------------------------------------------------------
3266 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3267 | and significand `zSig', and returns the proper half-precision floating-
3268 | point value corresponding to the abstract input. Ordinarily, the abstract
3269 | value is simply rounded and packed into the half-precision format, with
3270 | the inexact exception raised if the abstract input cannot be represented
3271 | exactly. However, if the abstract value is too large, the overflow and
3272 | inexact exceptions are raised and an infinity or maximal finite value is
3273 | returned. If the abstract value is too small, the input value is rounded to
3274 | a subnormal number, and the underflow and inexact exceptions are raised if
3275 | the abstract input cannot be represented exactly as a subnormal half-
3276 | precision floating-point number.
3277 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3278 | ARM-style "alternative representation", which omits the NaN and Inf
3279 | encodings in order to raise the maximum representable exponent by one.
3280 | The input significand `zSig' has its binary point between bits 22
3281 | and 23, which is 13 bits to the left of the usual location. This shifted
3282 | significand must be normalized or smaller. If `zSig' is not normalized,
3283 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3284 | and it must not require rounding. In the usual case that `zSig' is
3285 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3286 | Note the slightly odd position of the binary point in zSig compared with the
3287 | other roundAndPackFloat functions. This should probably be fixed if we
3288 | need to implement more float16 routines than just conversion.
3289 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3290 | Binary Floating-Point Arithmetic.
3291 *----------------------------------------------------------------------------*/
3293 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3294 uint32_t zSig, flag ieee,
3295 float_status *status)
3297 int maxexp = ieee ? 29 : 30;
3298 uint32_t mask;
3299 uint32_t increment;
3300 bool rounding_bumps_exp;
3301 bool is_tiny = false;
3303 /* Calculate the mask of bits of the mantissa which are not
3304 * representable in half-precision and will be lost.
3306 if (zExp < 1) {
3307 /* Will be denormal in halfprec */
3308 mask = 0x00ffffff;
3309 if (zExp >= -11) {
3310 mask >>= 11 + zExp;
3312 } else {
3313 /* Normal number in halfprec */
3314 mask = 0x00001fff;
3317 switch (STATUS(float_rounding_mode)) {
3318 case float_round_nearest_even:
3319 increment = (mask + 1) >> 1;
3320 if ((zSig & mask) == increment) {
3321 increment = zSig & (increment << 1);
3323 break;
3324 case float_round_ties_away:
3325 increment = (mask + 1) >> 1;
3326 break;
3327 case float_round_up:
3328 increment = zSign ? 0 : mask;
3329 break;
3330 case float_round_down:
3331 increment = zSign ? mask : 0;
3332 break;
3333 default: /* round_to_zero */
3334 increment = 0;
3335 break;
3338 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3340 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3341 if (ieee) {
3342 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3343 return packFloat16(zSign, 0x1f, 0);
3344 } else {
3345 float_raise(float_flag_invalid STATUS_VAR);
3346 return packFloat16(zSign, 0x1f, 0x3ff);
3350 if (zExp < 0) {
3351 /* Note that flush-to-zero does not affect half-precision results */
3352 is_tiny =
3353 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3354 || (zExp < -1)
3355 || (!rounding_bumps_exp);
3357 if (zSig & mask) {
3358 float_raise(float_flag_inexact STATUS_VAR);
3359 if (is_tiny) {
3360 float_raise(float_flag_underflow STATUS_VAR);
3364 zSig += increment;
3365 if (rounding_bumps_exp) {
3366 zSig >>= 1;
3367 zExp++;
3370 if (zExp < -10) {
3371 return packFloat16(zSign, 0, 0);
3373 if (zExp < 0) {
3374 zSig >>= -zExp;
3375 zExp = 0;
3377 return packFloat16(zSign, zExp, zSig >> 13);
/* Normalizes the subnormal half-precision significand `aSig'.  The shifted
 * significand is stored at `zSigPtr' and the matching exponent, 1 minus the
 * shift count, at `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = countLeadingZeros32(aSig) - 21;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}
3388 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3389 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3391 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3393 flag aSign;
3394 int_fast16_t aExp;
3395 uint32_t aSig;
3397 aSign = extractFloat16Sign(a);
3398 aExp = extractFloat16Exp(a);
3399 aSig = extractFloat16Frac(a);
3401 if (aExp == 0x1f && ieee) {
3402 if (aSig) {
3403 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3405 return packFloat32(aSign, 0xff, 0);
3407 if (aExp == 0) {
3408 if (aSig == 0) {
3409 return packFloat32(aSign, 0, 0);
3412 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3413 aExp--;
3415 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3418 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3420 flag aSign;
3421 int_fast16_t aExp;
3422 uint32_t aSig;
3424 a = float32_squash_input_denormal(a STATUS_VAR);
3426 aSig = extractFloat32Frac( a );
3427 aExp = extractFloat32Exp( a );
3428 aSign = extractFloat32Sign( a );
3429 if ( aExp == 0xFF ) {
3430 if (aSig) {
3431 /* Input is a NaN */
3432 if (!ieee) {
3433 float_raise(float_flag_invalid STATUS_VAR);
3434 return packFloat16(aSign, 0, 0);
3436 return commonNaNToFloat16(
3437 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3439 /* Infinity */
3440 if (!ieee) {
3441 float_raise(float_flag_invalid STATUS_VAR);
3442 return packFloat16(aSign, 0x1f, 0x3ff);
3444 return packFloat16(aSign, 0x1f, 0);
3446 if (aExp == 0 && aSig == 0) {
3447 return packFloat16(aSign, 0, 0);
3449 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3450 * even if the input is denormal; however this is harmless because
3451 * the largest possible single-precision denormal is still smaller
3452 * than the smallest representable half-precision denormal, and so we
3453 * will end up ignoring aSig and returning via the "always return zero"
3454 * codepath.
3456 aSig |= 0x00800000;
3457 aExp -= 0x71;
3459 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3462 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3464 flag aSign;
3465 int_fast16_t aExp;
3466 uint32_t aSig;
3468 aSign = extractFloat16Sign(a);
3469 aExp = extractFloat16Exp(a);
3470 aSig = extractFloat16Frac(a);
3472 if (aExp == 0x1f && ieee) {
3473 if (aSig) {
3474 return commonNaNToFloat64(
3475 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3477 return packFloat64(aSign, 0x7ff, 0);
3479 if (aExp == 0) {
3480 if (aSig == 0) {
3481 return packFloat64(aSign, 0, 0);
3484 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3485 aExp--;
3487 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3490 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3492 flag aSign;
3493 int_fast16_t aExp;
3494 uint64_t aSig;
3495 uint32_t zSig;
3497 a = float64_squash_input_denormal(a STATUS_VAR);
3499 aSig = extractFloat64Frac(a);
3500 aExp = extractFloat64Exp(a);
3501 aSign = extractFloat64Sign(a);
3502 if (aExp == 0x7FF) {
3503 if (aSig) {
3504 /* Input is a NaN */
3505 if (!ieee) {
3506 float_raise(float_flag_invalid STATUS_VAR);
3507 return packFloat16(aSign, 0, 0);
3509 return commonNaNToFloat16(
3510 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3512 /* Infinity */
3513 if (!ieee) {
3514 float_raise(float_flag_invalid STATUS_VAR);
3515 return packFloat16(aSign, 0x1f, 0x3ff);
3517 return packFloat16(aSign, 0x1f, 0);
3519 shift64RightJamming(aSig, 29, &aSig);
3520 zSig = aSig;
3521 if (aExp == 0 && zSig == 0) {
3522 return packFloat16(aSign, 0, 0);
3524 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3525 * even if the input is denormal; however this is harmless because
3526 * the largest possible single-precision denormal is still smaller
3527 * than the smallest representable half-precision denormal, and so we
3528 * will end up ignoring aSig and returning via the "always return zero"
3529 * codepath.
3531 zSig |= 0x00800000;
3532 aExp -= 0x3F1;
3534 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3537 /*----------------------------------------------------------------------------
3538 | Returns the result of converting the double-precision floating-point value
3539 | `a' to the extended double-precision floating-point format. The conversion
3540 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3541 | Arithmetic.
3542 *----------------------------------------------------------------------------*/
3544 floatx80 float64_to_floatx80(float64 a, float_status *status)
3546 flag aSign;
3547 int_fast16_t aExp;
3548 uint64_t aSig;
3550 a = float64_squash_input_denormal(a STATUS_VAR);
3551 aSig = extractFloat64Frac( a );
3552 aExp = extractFloat64Exp( a );
3553 aSign = extractFloat64Sign( a );
3554 if ( aExp == 0x7FF ) {
3555 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3556 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3558 if ( aExp == 0 ) {
3559 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3560 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3562 return
3563 packFloatx80(
3564 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3568 /*----------------------------------------------------------------------------
3569 | Returns the result of converting the double-precision floating-point value
3570 | `a' to the quadruple-precision floating-point format. The conversion is
3571 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3572 | Arithmetic.
3573 *----------------------------------------------------------------------------*/
3575 float128 float64_to_float128(float64 a, float_status *status)
3577 flag aSign;
3578 int_fast16_t aExp;
3579 uint64_t aSig, zSig0, zSig1;
3581 a = float64_squash_input_denormal(a STATUS_VAR);
3582 aSig = extractFloat64Frac( a );
3583 aExp = extractFloat64Exp( a );
3584 aSign = extractFloat64Sign( a );
3585 if ( aExp == 0x7FF ) {
3586 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3587 return packFloat128( aSign, 0x7FFF, 0, 0 );
3589 if ( aExp == 0 ) {
3590 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3591 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3592 --aExp;
3594 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3595 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3599 /*----------------------------------------------------------------------------
3600 | Rounds the double-precision floating-point value `a' to an integer, and
3601 | returns the result as a double-precision floating-point value. The
3602 | operation is performed according to the IEC/IEEE Standard for Binary
3603 | Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3606 float64 float64_round_to_int(float64 a, float_status *status)
/* Works on the raw bit pattern of `a': the rounding mode is read from the
 * status, and float_flag_inexact is OR-ed into the exception flags whenever
 * the returned value differs from the (denormal-squashed) input. */
3608 flag aSign;
3609 int_fast16_t aExp;
3610 uint64_t lastBitMask, roundBitsMask;
3611 uint64_t z;
3612 a = float64_squash_input_denormal(a STATUS_VAR);
3614 aExp = extractFloat64Exp( a );
/* Exponent field >= 0x433: the input is returned unchanged, except that a
 * NaN (exponent 0x7FF with a non-zero fraction) goes through
 * propagateFloat64NaN. */
3615 if ( 0x433 <= aExp ) {
3616 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3617 return propagateFloat64NaN( a, a STATUS_VAR );
3619 return a;
/* Exponent field < 0x3FF: a signed zero input (all bits below the sign bit
 * clear) is returned as is.  Anything else is inexact; the rounding mode
 * selects between a signed zero and a value with exponent field 0x3FF and
 * zero fraction. */
3621 if ( aExp < 0x3FF ) {
3622 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3623 STATUS(float_exception_flags) |= float_flag_inexact;
3624 aSign = extractFloat64Sign( a );
3625 switch ( STATUS(float_rounding_mode) ) {
3626 case float_round_nearest_even:
3627 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3628 return packFloat64( aSign, 0x3FF, 0 );
3630 break;
3631 case float_round_ties_away:
3632 if (aExp == 0x3FE) {
3633 return packFloat64(aSign, 0x3ff, 0);
3635 break;
3636 case float_round_down:
3637 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3638 case float_round_up:
3639 return make_float64(
3640 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
/* Modes that did not return above produce a zero carrying the input's sign. */
3642 return packFloat64( aSign, 0, 0 );
/* Remaining exponents: lastBitMask is the single bit at position
 * (0x433 - aExp); roundBitsMask covers every bit below it.  The bits under
 * roundBitsMask are cleared after the mode-specific adjustment of z. */
3644 lastBitMask = 1;
3645 lastBitMask <<= 0x433 - aExp;
3646 roundBitsMask = lastBitMask - 1;
3647 z = float64_val(a);
3648 switch (STATUS(float_rounding_mode)) {
3649 case float_round_nearest_even:
/* Add half of lastBitMask; if the low bits then come out all zero, the
 * lastBitMask bit is cleared as well. */
3650 z += lastBitMask >> 1;
3651 if ((z & roundBitsMask) == 0) {
3652 z &= ~lastBitMask;
3654 break;
3655 case float_round_ties_away:
3656 z += lastBitMask >> 1;
3657 break;
3658 case float_round_to_zero:
3659 break;
3660 case float_round_up:
/* Only values with a clear sign bit are bumped before truncation. */
3661 if (!extractFloat64Sign(make_float64(z))) {
3662 z += roundBitsMask;
3664 break;
3665 case float_round_down:
/* Only values with a set sign bit are bumped before truncation. */
3666 if (extractFloat64Sign(make_float64(z))) {
3667 z += roundBitsMask;
3669 break;
3670 default:
3671 abort();
3673 z &= ~ roundBitsMask;
/* Flag inexact only when the bit pattern actually changed. */
3674 if ( z != float64_val(a) )
3675 STATUS(float_exception_flags) |= float_flag_inexact;
3676 return make_float64(z);
3680 float64 float64_trunc_to_int(float64 a, float_status *status)
3682 int oldmode;
3683 float64 res;
3684 oldmode = STATUS(float_rounding_mode);
3685 STATUS(float_rounding_mode) = float_round_to_zero;
3686 res = float64_round_to_int(a STATUS_VAR);
3687 STATUS(float_rounding_mode) = oldmode;
3688 return res;
3691 /*----------------------------------------------------------------------------
3692 | Returns the result of adding the absolute values of the double-precision
3693 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3694 | before being returned. `zSign' is ignored if the result is a NaN.
3695 | The addition is performed according to the IEC/IEEE Standard for Binary
3696 | Floating-Point Arithmetic.
3697 *----------------------------------------------------------------------------*/
3699 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3700 float_status *status)
/* The three branches below are selected by the sign of expDiff
 * (aExp - bExp).  Every path either returns directly or reaches the
 * roundAndPack label. */
3702 int_fast16_t aExp, bExp, zExp;
3703 uint64_t aSig, bSig, zSig;
3704 int_fast16_t expDiff;
3706 aSig = extractFloat64Frac( a );
3707 aExp = extractFloat64Exp( a );
3708 bSig = extractFloat64Frac( b );
3709 bExp = extractFloat64Exp( b );
3710 expDiff = aExp - bExp;
/* Both fractions are shifted left by 9 bits before being aligned. */
3711 aSig <<= 9;
3712 bSig <<= 9;
/* a has the larger exponent: an exponent of 0x7FF in a returns a NaN
 * (non-zero fraction) or a itself.  Otherwise bSig is shifted right, with
 * jamming, by expDiff. */
3713 if ( 0 < expDiff ) {
3714 if ( aExp == 0x7FF ) {
3715 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3716 return a;
/* With bExp == 0 the 0x2000000000000000 bit is not OR-ed into bSig and the
 * shift count is reduced by one instead. */
3718 if ( bExp == 0 ) {
3719 --expDiff;
3721 else {
3722 bSig |= LIT64( 0x2000000000000000 );
3724 shift64RightJamming( bSig, expDiff, &bSig );
3725 zExp = aExp;
/* b has the larger exponent: mirror image of the branch above, except that
 * an exponent of 0x7FF with zero fraction in b returns
 * packFloat64( zSign, 0x7FF, 0 ). */
3727 else if ( expDiff < 0 ) {
3728 if ( bExp == 0x7FF ) {
3729 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3730 return packFloat64( zSign, 0x7FF, 0 );
3732 if ( aExp == 0 ) {
3733 ++expDiff;
3735 else {
3736 aSig |= LIT64( 0x2000000000000000 );
3738 shift64RightJamming( aSig, - expDiff, &aSig );
3739 zExp = bExp;
/* Equal exponents. */
3741 else {
3742 if ( aExp == 0x7FF ) {
3743 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3744 return a;
/* Both exponents are zero: with flush_to_zero set the result is a signed
 * zero, and float_flag_output_denormal is raised when either fraction is
 * non-zero.  Otherwise the fractions are summed, shifted back down by 9 and
 * packed with a zero exponent. */
3746 if ( aExp == 0 ) {
3747 if (STATUS(flush_to_zero)) {
3748 if (aSig | bSig) {
3749 float_raise(float_flag_output_denormal STATUS_VAR);
3751 return packFloat64(zSign, 0, 0);
3753 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3755 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3756 zExp = aExp;
3757 goto roundAndPack;
/* Reached from the two unequal-exponent branches: the operand with the
 * larger exponent (still named aSig here) gets the 0x2000000000000000 bit,
 * and the sum is tried shifted left by one with zExp decremented.  If that
 * sets the top bit, the unshifted sum and original zExp are used instead. */
3759 aSig |= LIT64( 0x2000000000000000 );
3760 zSig = ( aSig + bSig )<<1;
3761 --zExp;
3762 if ( (int64_t) zSig < 0 ) {
3763 zSig = aSig + bSig;
3764 ++zExp;
3766 roundAndPack:
3767 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3771 /*----------------------------------------------------------------------------
3772 | Returns the result of subtracting the absolute values of the double-
3773 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3774 | difference is negated before being returned. `zSign' is ignored if the
3775 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3776 | Standard for Binary Floating-Point Arithmetic.
3777 *----------------------------------------------------------------------------*/
3779 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3780 float_status *status)
/* Control flow is label-driven: aExpBigger / bExpBigger align the operand
 * with the smaller exponent, aBigger / bBigger perform the subtraction in
 * the order that keeps the difference non-negative, and every non-returning
 * path ends at normalizeRoundAndPack. */
3782 int_fast16_t aExp, bExp, zExp;
3783 uint64_t aSig, bSig, zSig;
3784 int_fast16_t expDiff;
3786 aSig = extractFloat64Frac( a );
3787 aExp = extractFloat64Exp( a );
3788 bSig = extractFloat64Frac( b );
3789 bExp = extractFloat64Exp( b );
3790 expDiff = aExp - bExp;
/* Both fractions are shifted left by 10 bits before being aligned. */
3791 aSig <<= 10;
3792 bSig <<= 10;
3793 if ( 0 < expDiff ) goto aExpBigger;
3794 if ( expDiff < 0 ) goto bExpBigger;
/* Equal exponents.  With exponent 0x7FF a NaN operand is propagated;
 * otherwise the invalid flag is raised and the default NaN returned. */
3795 if ( aExp == 0x7FF ) {
3796 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3797 float_raise( float_flag_invalid STATUS_VAR);
3798 return float64_default_nan;
/* Zero exponents are both replaced by 1 before the subtraction. */
3800 if ( aExp == 0 ) {
3801 aExp = 1;
3802 bExp = 1;
3804 if ( bSig < aSig ) goto aBigger;
3805 if ( aSig < bSig ) goto bBigger;
/* Equal significands: the result is a zero whose sign bit is set only in
 * float_round_down mode. */
3806 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3807 bExpBigger:
/* b with exponent 0x7FF: propagate a NaN, otherwise return
 * packFloat64( zSign ^ 1, 0x7FF, 0 ). */
3808 if ( bExp == 0x7FF ) {
3809 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3810 return packFloat64( zSign ^ 1, 0x7FF, 0 );
/* With aExp == 0 the 0x4000000000000000 bit is not OR-ed into aSig and the
 * shift distance shrinks by one instead. */
3812 if ( aExp == 0 ) {
3813 ++expDiff;
3815 else {
3816 aSig |= LIT64( 0x4000000000000000 );
3818 shift64RightJamming( aSig, - expDiff, &aSig );
3819 bSig |= LIT64( 0x4000000000000000 );
3820 bBigger:
/* b - a: the result takes b's exponent and the opposite of zSign. */
3821 zSig = bSig - aSig;
3822 zExp = bExp;
3823 zSign ^= 1;
3824 goto normalizeRoundAndPack;
3825 aExpBigger:
/* a with exponent 0x7FF: propagate a NaN, otherwise return a. */
3826 if ( aExp == 0x7FF ) {
3827 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3828 return a;
3830 if ( bExp == 0 ) {
3831 --expDiff;
3833 else {
3834 bSig |= LIT64( 0x4000000000000000 );
3836 shift64RightJamming( bSig, expDiff, &bSig );
3837 aSig |= LIT64( 0x4000000000000000 );
3838 aBigger:
/* a - b: the result keeps a's exponent and zSign. */
3839 zSig = aSig - bSig;
3840 zExp = aExp;
3841 normalizeRoundAndPack:
/* The exponent handed to normalizeRoundAndPackFloat64 is one less than zExp. */
3842 --zExp;
3843 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3847 /*----------------------------------------------------------------------------
3848 | Returns the result of adding the double-precision floating-point values `a'
3849 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3850 | Binary Floating-Point Arithmetic.
3851 *----------------------------------------------------------------------------*/
3853 float64 float64_add(float64 a, float64 b, float_status *status)
3855 flag aSign, bSign;
3856 a = float64_squash_input_denormal(a STATUS_VAR);
3857 b = float64_squash_input_denormal(b STATUS_VAR);
3859 aSign = extractFloat64Sign( a );
3860 bSign = extractFloat64Sign( b );
3861 if ( aSign == bSign ) {
3862 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3864 else {
3865 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3870 /*----------------------------------------------------------------------------
3871 | Returns the result of subtracting the double-precision floating-point values
3872 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3873 | for Binary Floating-Point Arithmetic.
3874 *----------------------------------------------------------------------------*/
3876 float64 float64_sub(float64 a, float64 b, float_status *status)
3878 flag aSign, bSign;
3879 a = float64_squash_input_denormal(a STATUS_VAR);
3880 b = float64_squash_input_denormal(b STATUS_VAR);
3882 aSign = extractFloat64Sign( a );
3883 bSign = extractFloat64Sign( b );
3884 if ( aSign == bSign ) {
3885 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3887 else {
3888 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of multiplying the double-precision floating-point values
3895 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3896 | for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3899 float64 float64_mul(float64 a, float64 b, float_status *status)
3901 flag aSign, bSign, zSign;
3902 int_fast16_t aExp, bExp, zExp;
3903 uint64_t aSig, bSig, zSig0, zSig1;
3905 a = float64_squash_input_denormal(a STATUS_VAR);
3906 b = float64_squash_input_denormal(b STATUS_VAR);
3908 aSig = extractFloat64Frac( a );
3909 aExp = extractFloat64Exp( a );
3910 aSign = extractFloat64Sign( a );
3911 bSig = extractFloat64Frac( b );
3912 bExp = extractFloat64Exp( b );
3913 bSign = extractFloat64Sign( b );
3914 zSign = aSign ^ bSign;
3915 if ( aExp == 0x7FF ) {
3916 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3917 return propagateFloat64NaN( a, b STATUS_VAR );
3919 if ( ( bExp | bSig ) == 0 ) {
3920 float_raise( float_flag_invalid STATUS_VAR);
3921 return float64_default_nan;
3923 return packFloat64( zSign, 0x7FF, 0 );
3925 if ( bExp == 0x7FF ) {
3926 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3927 if ( ( aExp | aSig ) == 0 ) {
3928 float_raise( float_flag_invalid STATUS_VAR);
3929 return float64_default_nan;
3931 return packFloat64( zSign, 0x7FF, 0 );
3933 if ( aExp == 0 ) {
3934 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3935 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3937 if ( bExp == 0 ) {
3938 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3939 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3941 zExp = aExp + bExp - 0x3FF;
3942 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3943 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3944 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3945 zSig0 |= ( zSig1 != 0 );
3946 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3947 zSig0 <<= 1;
3948 --zExp;
3950 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3954 /*----------------------------------------------------------------------------
3955 | Returns the result of dividing the double-precision floating-point value `a'
3956 | by the corresponding value `b'. The operation is performed according to
3957 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3958 *----------------------------------------------------------------------------*/
3960 float64 float64_div(float64 a, float64 b, float_status *status)
/* Special operands are handled first, each returning directly; the
 * remaining operands are normalized and divided through a 64-bit quotient
 * estimate that is corrected when its low bits are small. */
3962 flag aSign, bSign, zSign;
3963 int_fast16_t aExp, bExp, zExp;
3964 uint64_t aSig, bSig, zSig;
3965 uint64_t rem0, rem1;
3966 uint64_t term0, term1;
3967 a = float64_squash_input_denormal(a STATUS_VAR);
3968 b = float64_squash_input_denormal(b STATUS_VAR);
3970 aSig = extractFloat64Frac( a );
3971 aExp = extractFloat64Exp( a );
3972 aSign = extractFloat64Sign( a );
3973 bSig = extractFloat64Frac( b );
3974 bExp = extractFloat64Exp( b );
3975 bSign = extractFloat64Sign( b );
3976 zSign = aSign ^ bSign;
/* a has exponent 0x7FF: NaNs in either operand are propagated; if b also
 * has exponent 0x7FF with a zero fraction the invalid flag is raised and
 * the default NaN returned; otherwise the result is
 * packFloat64( zSign, 0x7FF, 0 ). */
3977 if ( aExp == 0x7FF ) {
3978 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3979 if ( bExp == 0x7FF ) {
3980 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3981 float_raise( float_flag_invalid STATUS_VAR);
3982 return float64_default_nan;
3984 return packFloat64( zSign, 0x7FF, 0 );
/* Only b has exponent 0x7FF: propagate a NaN, otherwise return a signed
 * zero. */
3986 if ( bExp == 0x7FF ) {
3987 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3988 return packFloat64( zSign, 0, 0 );
/* b is zero: a zero a raises invalid and returns the default NaN; any other
 * a raises divbyzero and returns packFloat64( zSign, 0x7FF, 0 ).  A non-zero
 * b with zero exponent is normalized. */
3990 if ( bExp == 0 ) {
3991 if ( bSig == 0 ) {
3992 if ( ( aExp | aSig ) == 0 ) {
3993 float_raise( float_flag_invalid STATUS_VAR);
3994 return float64_default_nan;
3996 float_raise( float_flag_divbyzero STATUS_VAR);
3997 return packFloat64( zSign, 0x7FF, 0 );
3999 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
/* A zero a gives a signed zero; a non-zero a with zero exponent is
 * normalized. */
4001 if ( aExp == 0 ) {
4002 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4003 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4005 zExp = aExp - bExp + 0x3FD;
4006 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4007 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
/* When bSig <= 2 * aSig, halve aSig and bump the exponent before dividing. */
4008 if ( bSig <= ( aSig + aSig ) ) {
4009 aSig >>= 1;
4010 ++zExp;
4012 zSig = estimateDiv128To64( aSig, 0, bSig );
/* The estimate is refined only when its low nine bits are <= 2: the
 * remainder aSig:0 - bSig * zSig is computed, zSig is decremented until
 * that remainder is non-negative, and a non-zero rem1 sets bit 0 of zSig. */
4013 if ( ( zSig & 0x1FF ) <= 2 ) {
4014 mul64To128( bSig, zSig, &term0, &term1 );
4015 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4016 while ( (int64_t) rem0 < 0 ) {
4017 --zSig;
4018 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4020 zSig |= ( rem1 != 0 );
4022 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
4026 /*----------------------------------------------------------------------------
4027 | Returns the remainder of the double-precision floating-point value `a'
4028 | with respect to the corresponding value `b'. The operation is performed
4029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4030 *----------------------------------------------------------------------------*/
4032 float64 float64_rem(float64 a, float64 b, float_status *status)
/* Special operands return early.  Otherwise aSig is reduced against bSig in
 * chunks of quotient bits, and the final do/while picks between the last
 * two partial remainders. */
4034 flag aSign, zSign;
4035 int_fast16_t aExp, bExp, expDiff;
4036 uint64_t aSig, bSig;
4037 uint64_t q, alternateASig;
4038 int64_t sigMean;
4040 a = float64_squash_input_denormal(a STATUS_VAR);
4041 b = float64_squash_input_denormal(b STATUS_VAR);
4042 aSig = extractFloat64Frac( a );
4043 aExp = extractFloat64Exp( a );
4044 aSign = extractFloat64Sign( a );
4045 bSig = extractFloat64Frac( b );
4046 bExp = extractFloat64Exp( b );
/* a has exponent 0x7FF: NaN operands are propagated; otherwise the invalid
 * flag is raised and the default NaN returned. */
4047 if ( aExp == 0x7FF ) {
4048 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4049 return propagateFloat64NaN( a, b STATUS_VAR );
4051 float_raise( float_flag_invalid STATUS_VAR);
4052 return float64_default_nan;
/* b has exponent 0x7FF: a NaN b is propagated, otherwise a is returned
 * unchanged. */
4054 if ( bExp == 0x7FF ) {
4055 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
4056 return a;
/* A zero b raises invalid and returns the default NaN; a non-zero b with
 * zero exponent is normalized. */
4058 if ( bExp == 0 ) {
4059 if ( bSig == 0 ) {
4060 float_raise( float_flag_invalid STATUS_VAR);
4061 return float64_default_nan;
4063 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
/* A zero a is returned unchanged; a non-zero a with zero exponent is
 * normalized. */
4065 if ( aExp == 0 ) {
4066 if ( aSig == 0 ) return a;
4067 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4069 expDiff = aExp - bExp;
4070 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4071 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
/* expDiff < -1: a is returned unchanged.  expDiff == -1: aSig is halved
 * and processing continues. */
4072 if ( expDiff < 0 ) {
4073 if ( expDiff < -1 ) return a;
4074 aSig >>= 1;
4076 q = ( bSig <= aSig );
4077 if ( q ) aSig -= bSig;
/* Each loop pass consumes 62 of expDiff, using the quotient estimate
 * reduced by 2 (clamped at 0); the unsigned negation leaves the new
 * partial remainder in aSig. */
4078 expDiff -= 64;
4079 while ( 0 < expDiff ) {
4080 q = estimateDiv128To64( aSig, 0, bSig );
4081 q = ( 2 < q ) ? q - 2 : 0;
4082 aSig = - ( ( bSig>>2 ) * q );
4083 expDiff -= 62;
4085 expDiff += 64;
/* Handle the remaining expDiff (if positive) with one more estimate whose
 * surplus low bits are shifted out; otherwise just rescale both
 * significands by 2 bits. */
4086 if ( 0 < expDiff ) {
4087 q = estimateDiv128To64( aSig, 0, bSig );
4088 q = ( 2 < q ) ? q - 2 : 0;
4089 q >>= 64 - expDiff;
4090 bSig >>= 2;
4091 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4093 else {
4094 aSig >>= 2;
4095 bSig >>= 2;
/* Subtract bSig until aSig, viewed as signed, turns negative;
 * alternateASig keeps the value from just before the last subtraction and
 * q counts the subtractions. */
4097 do {
4098 alternateASig = aSig;
4099 ++q;
4100 aSig -= bSig;
4101 } while ( 0 <= (int64_t) aSig );
/* Pick between the two candidates from the sign of their sum; when the sum
 * is zero, the low bit of q decides. */
4102 sigMean = aSig + alternateASig;
4103 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4104 aSig = alternateASig;
/* A negative remainder is negated and flips the result sign relative to a. */
4106 zSign = ( (int64_t) aSig < 0 );
4107 if ( zSign ) aSig = - aSig;
4108 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4112 /*----------------------------------------------------------------------------
4113 | Returns the result of multiplying the double-precision floating-point values
4114 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4115 | multiplication. The operation is performed according to the IEC/IEEE
4116 | Standard for Binary Floating-Point Arithmetic 754-2008.
4117 | The flags argument allows the caller to select negation of the
4118 | addend, the intermediate product, or the final result. (The difference
4119 | between this and having the caller do a separate negation is that negating
4120 | externally will flip the sign bit on NaNs.)
4121 *----------------------------------------------------------------------------*/
4123 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4124 float_status *status)
/* `flags' is tested against float_muladd_negate_c,
 * float_muladd_negate_product, float_muladd_negate_result and
 * float_muladd_halve_result.  The product is kept as the 128-bit pair
 * [pSig0:pSig1] and the addend as [cSig0:cSig1] until the final rounding. */
4126 flag aSign, bSign, cSign, zSign;
4127 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4128 uint64_t aSig, bSig, cSig;
4129 flag pInf, pZero, pSign;
4130 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4131 int shiftcount;
4132 flag signflip, infzero;
4134 a = float64_squash_input_denormal(a STATUS_VAR);
4135 b = float64_squash_input_denormal(b STATUS_VAR);
4136 c = float64_squash_input_denormal(c STATUS_VAR);
4137 aSig = extractFloat64Frac(a);
4138 aExp = extractFloat64Exp(a);
4139 aSign = extractFloat64Sign(a);
4140 bSig = extractFloat64Frac(b);
4141 bExp = extractFloat64Exp(b);
4142 bSign = extractFloat64Sign(b);
4143 cSig = extractFloat64Frac(c);
4144 cExp = extractFloat64Exp(c);
4145 cSign = extractFloat64Sign(c);
/* infzero: one multiplicand has zero exponent and fraction while the other
 * has exponent 0x7ff with a zero fraction. */
4147 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4148 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4150 /* It is implementation-defined whether the cases of (0,inf,qnan)
4151 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4152 * they return if they do), so we have to hand this information
4153 * off to the target-specific pick-a-NaN routine.
4155 if (((aExp == 0x7ff) && aSig) ||
4156 ((bExp == 0x7ff) && bSig) ||
4157 ((cExp == 0x7ff) && cSig)) {
4158 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4161 if (infzero) {
4162 float_raise(float_flag_invalid STATUS_VAR);
4163 return float64_default_nan;
4166 if (flags & float_muladd_negate_c) {
4167 cSign ^= 1;
4170 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4172 /* Work out the sign and type of the product */
4173 pSign = aSign ^ bSign;
4174 if (flags & float_muladd_negate_product) {
4175 pSign ^= 1;
/* pInf: either multiplicand has exponent 0x7ff (NaNs were already returned
 * above).  pZero: either multiplicand has zero exponent and fraction. */
4177 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4178 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4180 if (cExp == 0x7ff) {
4181 if (pInf && (pSign ^ cSign)) {
4182 /* addition of opposite-signed infinities => InvalidOperation */
4183 float_raise(float_flag_invalid STATUS_VAR);
4184 return float64_default_nan;
4186 /* Otherwise generate an infinity of the same sign */
4187 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4190 if (pInf) {
4191 return packFloat64(pSign ^ signflip, 0x7ff, 0);
/* Zero product: the result depends only on c (and the sign rules for two
 * zeroes). */
4194 if (pZero) {
4195 if (cExp == 0) {
4196 if (cSig == 0) {
4197 /* Adding two exact zeroes */
4198 if (pSign == cSign) {
4199 zSign = pSign;
4200 } else if (STATUS(float_rounding_mode) == float_round_down) {
4201 zSign = 1;
4202 } else {
4203 zSign = 0;
4205 return packFloat64(zSign ^ signflip, 0, 0);
4207 /* Exact zero plus a denorm */
4208 if (STATUS(flush_to_zero)) {
4209 float_raise(float_flag_output_denormal STATUS_VAR);
4210 return packFloat64(cSign ^ signflip, 0, 0);
4213 /* Zero plus something non-zero : just return the something */
4214 if (flags & float_muladd_halve_result) {
4215 if (cExp == 0) {
4216 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4218 /* Subtract one to halve, and one again because roundAndPackFloat64
4219 * wants one less than the true exponent.
4221 cExp -= 2;
4222 cSig = (cSig | 0x0010000000000000ULL) << 10;
4223 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4225 return packFloat64(cSign ^ signflip, cExp, cSig);
4228 if (aExp == 0) {
4229 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4231 if (bExp == 0) {
4232 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4235 /* Calculate the actual result a * b + c */
4237 /* Multiply first; this is easy. */
4238 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4239 * because we want the true exponent, not the "one-less-than"
4240 * flavour that roundAndPackFloat64() takes.
4242 pExp = aExp + bExp - 0x3fe;
4243 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4244 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4245 mul64To128(aSig, bSig, &pSig0, &pSig1);
4246 if ((int64_t)(pSig0 << 1) >= 0) {
4247 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4248 pExp--;
4251 zSign = pSign ^ signflip;
4253 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4254 * bit in position 126.
4256 if (cExp == 0) {
4257 if (!cSig) {
4258 /* Throw out the special case of c being an exact zero now */
4259 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4260 if (flags & float_muladd_halve_result) {
4261 pExp--;
4263 return roundAndPackFloat64(zSign, pExp - 1,
4264 pSig1 STATUS_VAR);
4266 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4269 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4270 * significand of the addend, with the explicit bit in position 126.
4272 cSig0 = cSig << (126 - 64 - 52);
4273 cSig1 = 0;
4274 cSig0 |= LIT64(0x4000000000000000);
4275 expDiff = pExp - cExp;
4277 if (pSign == cSign) {
4278 /* Addition */
4279 if (expDiff > 0) {
4280 /* scale c to match p */
4281 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4282 zExp = pExp;
4283 } else if (expDiff < 0) {
4284 /* scale p to match c */
4285 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4286 zExp = cExp;
4287 } else {
4288 /* no scaling needed */
4289 zExp = cExp;
4291 /* Add significands and make sure explicit bit ends up in posn 126 */
4292 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4293 if ((int64_t)zSig0 < 0) {
4294 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4295 } else {
4296 zExp--;
/* Shift the sum right by 64 with jamming; the value rounded below is
 * zSig1. */
4298 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4299 if (flags & float_muladd_halve_result) {
4300 zExp--;
4302 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4303 } else {
4304 /* Subtraction */
/* The smaller-exponent operand is scaled first; whenever the addend ends up
 * as the minuend the result sign is flipped. */
4305 if (expDiff > 0) {
4306 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4307 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4308 zExp = pExp;
4309 } else if (expDiff < 0) {
4310 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4311 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4312 zExp = cExp;
4313 zSign ^= 1;
4314 } else {
4315 zExp = pExp;
4316 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4317 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4318 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4319 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4320 zSign ^= 1;
4321 } else {
4322 /* Exact zero */
4323 zSign = signflip;
4324 if (STATUS(float_rounding_mode) == float_round_down) {
4325 zSign ^= 1;
4327 return packFloat64(zSign, 0, 0);
4330 --zExp;
4331 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4332 * starting with the significand in a pair of uint64_t.
4334 if (zSig0) {
4335 shiftcount = countLeadingZeros64(zSig0) - 1;
4336 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4337 if (zSig1) {
4338 zSig0 |= 1;
4340 zExp -= shiftcount;
4341 } else {
4342 shiftcount = countLeadingZeros64(zSig1);
4343 if (shiftcount == 0) {
4344 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4345 zExp -= 63;
4346 } else {
4347 shiftcount--;
4348 zSig0 = zSig1 << shiftcount;
4349 zExp -= (shiftcount + 64);
4352 if (flags & float_muladd_halve_result) {
4353 zExp--;
4355 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4359 /*----------------------------------------------------------------------------
4360 | Returns the square root of the double-precision floating-point value `a'.
4361 | The operation is performed according to the IEC/IEEE Standard for Binary
4362 | Floating-Point Arithmetic.
4363 *----------------------------------------------------------------------------*/
4365 float64 float64_sqrt(float64 a, float_status *status)
/* After the special cases, a 32-bit root estimate is widened with one
 * 128-by-64 division step and corrected against the exact remainder when
 * its low bits are small. */
4367 flag aSign;
4368 int_fast16_t aExp, zExp;
4369 uint64_t aSig, zSig, doubleZSig;
4370 uint64_t rem0, rem1, term0, term1;
4371 a = float64_squash_input_denormal(a STATUS_VAR);
4373 aSig = extractFloat64Frac( a );
4374 aExp = extractFloat64Exp( a );
4375 aSign = extractFloat64Sign( a );
/* Exponent 0x7FF: a NaN is propagated, a value with clear sign bit is
 * returned as is, and one with the sign bit set raises invalid and yields
 * the default NaN. */
4376 if ( aExp == 0x7FF ) {
4377 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4378 if ( ! aSign ) return a;
4379 float_raise( float_flag_invalid STATUS_VAR);
4380 return float64_default_nan;
/* Sign bit set: a zero is returned unchanged; anything else raises invalid
 * and yields the default NaN. */
4382 if ( aSign ) {
4383 if ( ( aExp | aSig ) == 0 ) return a;
4384 float_raise( float_flag_invalid STATUS_VAR);
4385 return float64_default_nan;
/* Zero exponent, sign bit clear: zero returns float64_zero; other values
 * are normalized. */
4387 if ( aExp == 0 ) {
4388 if ( aSig == 0 ) return float64_zero;
4389 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4391 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4392 aSig |= LIT64( 0x0010000000000000 );
4393 zSig = estimateSqrt32( aExp, aSig>>21 );
/* The left shift is 9 or 8 depending on the low bit of the exponent. */
4394 aSig <<= 9 - ( aExp & 1 );
4395 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
/* Refine only when the low nine bits of the estimate are <= 5: compute
 * aSig:0 - zSig * zSig, step zSig down until that remainder is
 * non-negative, then set bit 0 of zSig if any remainder bits are left. */
4396 if ( ( zSig & 0x1FF ) <= 5 ) {
4397 doubleZSig = zSig<<1;
4398 mul64To128( zSig, zSig, &term0, &term1 );
4399 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4400 while ( (int64_t) rem0 < 0 ) {
4401 --zSig;
4402 doubleZSig -= 2;
4403 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4405 zSig |= ( ( rem0 | rem1 ) != 0 );
/* The result is always packed with sign 0. */
4407 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4411 /*----------------------------------------------------------------------------
4412 | Returns the binary log of the double-precision floating-point value `a'.
4413 | The operation is performed according to the IEC/IEEE Standard for Binary
4414 | Floating-Point Arithmetic.
4415 *----------------------------------------------------------------------------*/
4416 float64 float64_log2(float64 a, float_status *status)
/* The unbiased exponent seeds the upper part of the result (from bit 52
 * up), and 52 further bits are produced by repeatedly squaring the
 * significand. */
4418 flag aSign, zSign;
4419 int_fast16_t aExp;
4420 uint64_t aSig, aSig0, aSig1, zSig, i;
4421 a = float64_squash_input_denormal(a STATUS_VAR);
4423 aSig = extractFloat64Frac( a );
4424 aExp = extractFloat64Exp( a );
4425 aSign = extractFloat64Sign( a );
/* A zero of either sign returns packFloat64( 1, 0x7FF, 0 ) without raising
 * any flag here; other zero-exponent inputs are normalized.  This check
 * runs before the sign check below. */
4427 if ( aExp == 0 ) {
4428 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4429 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
/* Any remaining input with the sign bit set (including a NaN with the sign
 * bit set) raises invalid and yields the default NaN. */
4431 if ( aSign ) {
4432 float_raise( float_flag_invalid STATUS_VAR);
4433 return float64_default_nan;
/* Exponent 0x7FF with clear sign: a NaN is propagated; otherwise a is
 * returned unchanged. */
4435 if ( aExp == 0x7FF ) {
4436 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4437 return a;
4440 aExp -= 0x3FF;
4441 aSig |= LIT64( 0x0010000000000000 );
4442 zSign = aExp < 0;
4443 zSig = (uint64_t)aExp << 52;
/* i walks from bit 51 down to bit 0.  Each pass squares aSig and rescales
 * it; when bit 53 of the square is set, aSig is halved and bit i of zSig is
 * set. */
4444 for (i = 1LL << 51; i > 0; i >>= 1) {
4445 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4446 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4447 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4448 aSig >>= 1;
4449 zSig |= i;
/* For a negative unbiased exponent, zSig is negated (unsigned wraparound)
 * and the sign is carried in zSign. */
4453 if ( zSign )
4454 zSig = -zSig;
4455 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4458 /*----------------------------------------------------------------------------
4459 | Returns 1 if the double-precision floating-point value `a' is equal to the
4460 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4461 | if either operand is a NaN. Otherwise, the comparison is performed
4462 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4463 *----------------------------------------------------------------------------*/
4465 int float64_eq(float64 a, float64 b, float_status *status)
4467 uint64_t av, bv;
4468 a = float64_squash_input_denormal(a STATUS_VAR);
4469 b = float64_squash_input_denormal(b STATUS_VAR);
4471 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4472 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4474 float_raise( float_flag_invalid STATUS_VAR);
4475 return 0;
4477 av = float64_val(a);
4478 bv = float64_val(b);
4479 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4483 /*----------------------------------------------------------------------------
4484 | Returns 1 if the double-precision floating-point value `a' is less than or
4485 | equal to the corresponding value `b', and 0 otherwise. The invalid
4486 | exception is raised if either operand is a NaN. The comparison is performed
4487 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4488 *----------------------------------------------------------------------------*/
4490 int float64_le(float64 a, float64 b, float_status *status)
4492 flag aSign, bSign;
4493 uint64_t av, bv;
4494 a = float64_squash_input_denormal(a STATUS_VAR);
4495 b = float64_squash_input_denormal(b STATUS_VAR);
4497 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4498 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4500 float_raise( float_flag_invalid STATUS_VAR);
4501 return 0;
4503 aSign = extractFloat64Sign( a );
4504 bSign = extractFloat64Sign( b );
4505 av = float64_val(a);
4506 bv = float64_val(b);
4507 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4508 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4512 /*----------------------------------------------------------------------------
4513 | Returns 1 if the double-precision floating-point value `a' is less than
4514 | the corresponding value `b', and 0 otherwise. The invalid exception is
4515 | raised if either operand is a NaN. The comparison is performed according
4516 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4517 *----------------------------------------------------------------------------*/
4519 int float64_lt(float64 a, float64 b, float_status *status)
4521 flag aSign, bSign;
4522 uint64_t av, bv;
4524 a = float64_squash_input_denormal(a STATUS_VAR);
4525 b = float64_squash_input_denormal(b STATUS_VAR);
4526 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4527 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4529 float_raise( float_flag_invalid STATUS_VAR);
4530 return 0;
4532 aSign = extractFloat64Sign( a );
4533 bSign = extractFloat64Sign( b );
4534 av = float64_val(a);
4535 bv = float64_val(b);
4536 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4537 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4541 /*----------------------------------------------------------------------------
4542 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4543 | be compared, and 0 otherwise. The invalid exception is raised if either
4544 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4545 | Standard for Binary Floating-Point Arithmetic.
4546 *----------------------------------------------------------------------------*/
4548 int float64_unordered(float64 a, float64 b, float_status *status)
4550 a = float64_squash_input_denormal(a STATUS_VAR);
4551 b = float64_squash_input_denormal(b STATUS_VAR);
4553 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4554 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4556 float_raise( float_flag_invalid STATUS_VAR);
4557 return 1;
4559 return 0;
4562 /*----------------------------------------------------------------------------
4563 | Returns 1 if the double-precision floating-point value `a' is equal to the
4564 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4565 | exception. The comparison is performed according to the IEC/IEEE Standard
4566 | for Binary Floating-Point Arithmetic.
4567 *----------------------------------------------------------------------------*/
4569 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4571 uint64_t av, bv;
4572 a = float64_squash_input_denormal(a STATUS_VAR);
4573 b = float64_squash_input_denormal(b STATUS_VAR);
4575 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4576 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4578 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4579 float_raise( float_flag_invalid STATUS_VAR);
4581 return 0;
4583 av = float64_val(a);
4584 bv = float64_val(b);
4585 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4589 /*----------------------------------------------------------------------------
4590 | Returns 1 if the double-precision floating-point value `a' is less than or
4591 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4592 | cause an exception. Otherwise, the comparison is performed according to the
4593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4594 *----------------------------------------------------------------------------*/
4596 int float64_le_quiet(float64 a, float64 b, float_status *status)
4598 flag aSign, bSign;
4599 uint64_t av, bv;
4600 a = float64_squash_input_denormal(a STATUS_VAR);
4601 b = float64_squash_input_denormal(b STATUS_VAR);
4603 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4604 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4606 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4607 float_raise( float_flag_invalid STATUS_VAR);
4609 return 0;
4611 aSign = extractFloat64Sign( a );
4612 bSign = extractFloat64Sign( b );
4613 av = float64_val(a);
4614 bv = float64_val(b);
4615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4616 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4620 /*----------------------------------------------------------------------------
4621 | Returns 1 if the double-precision floating-point value `a' is less than
4622 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4623 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4624 | Standard for Binary Floating-Point Arithmetic.
4625 *----------------------------------------------------------------------------*/
4627 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4629 flag aSign, bSign;
4630 uint64_t av, bv;
4631 a = float64_squash_input_denormal(a STATUS_VAR);
4632 b = float64_squash_input_denormal(b STATUS_VAR);
4634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4637 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4638 float_raise( float_flag_invalid STATUS_VAR);
4640 return 0;
4642 aSign = extractFloat64Sign( a );
4643 bSign = extractFloat64Sign( b );
4644 av = float64_val(a);
4645 bv = float64_val(b);
4646 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4647 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4651 /*----------------------------------------------------------------------------
4652 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4653 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4654 | comparison is performed according to the IEC/IEEE Standard for Binary
4655 | Floating-Point Arithmetic.
4656 *----------------------------------------------------------------------------*/
4658 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4660 a = float64_squash_input_denormal(a STATUS_VAR);
4661 b = float64_squash_input_denormal(b STATUS_VAR);
4663 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4664 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4666 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4667 float_raise( float_flag_invalid STATUS_VAR);
4669 return 1;
4671 return 0;
4674 /*----------------------------------------------------------------------------
4675 | Returns the result of converting the extended double-precision floating-
4676 | point value `a' to the 32-bit two's complement integer format. The
4677 | conversion is performed according to the IEC/IEEE Standard for Binary
4678 | Floating-Point Arithmetic---which means in particular that the conversion
4679 | is rounded according to the current rounding mode. If `a' is a NaN, the
4680 | largest positive integer is returned. Otherwise, if the conversion
4681 | overflows, the largest integer with the same sign as `a' is returned.
4682 *----------------------------------------------------------------------------*/
4684 int32 floatx80_to_int32(floatx80 a, float_status *status)
4686 flag aSign;
4687 int32 aExp, shiftCount;
4688 uint64_t aSig;
4690 aSig = extractFloatx80Frac( a );
4691 aExp = extractFloatx80Exp( a );
4692 aSign = extractFloatx80Sign( a );
4693 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4694 shiftCount = 0x4037 - aExp;
4695 if ( shiftCount <= 0 ) shiftCount = 1;
4696 shift64RightJamming( aSig, shiftCount, &aSig );
4697 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4701 /*----------------------------------------------------------------------------
4702 | Returns the result of converting the extended double-precision floating-
4703 | point value `a' to the 32-bit two's complement integer format. The
4704 | conversion is performed according to the IEC/IEEE Standard for Binary
4705 | Floating-Point Arithmetic, except that the conversion is always rounded
4706 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4707 | Otherwise, if the conversion overflows, the largest integer with the same
4708 | sign as `a' is returned.
4709 *----------------------------------------------------------------------------*/
4711 int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4713 flag aSign;
4714 int32 aExp, shiftCount;
4715 uint64_t aSig, savedASig;
4716 int32_t z;
4718 aSig = extractFloatx80Frac( a );
4719 aExp = extractFloatx80Exp( a );
4720 aSign = extractFloatx80Sign( a );
4721 if ( 0x401E < aExp ) {
4722 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4723 goto invalid;
4725 else if ( aExp < 0x3FFF ) {
4726 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4727 return 0;
4729 shiftCount = 0x403E - aExp;
4730 savedASig = aSig;
4731 aSig >>= shiftCount;
4732 z = aSig;
4733 if ( aSign ) z = - z;
4734 if ( ( z < 0 ) ^ aSign ) {
4735 invalid:
4736 float_raise( float_flag_invalid STATUS_VAR);
4737 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4739 if ( ( aSig<<shiftCount ) != savedASig ) {
4740 STATUS(float_exception_flags) |= float_flag_inexact;
4742 return z;
4746 /*----------------------------------------------------------------------------
4747 | Returns the result of converting the extended double-precision floating-
4748 | point value `a' to the 64-bit two's complement integer format. The
4749 | conversion is performed according to the IEC/IEEE Standard for Binary
4750 | Floating-Point Arithmetic---which means in particular that the conversion
4751 | is rounded according to the current rounding mode. If `a' is a NaN,
4752 | the largest positive integer is returned. Otherwise, if the conversion
4753 | overflows, the largest integer with the same sign as `a' is returned.
4754 *----------------------------------------------------------------------------*/
4756 int64 floatx80_to_int64(floatx80 a, float_status *status)
4758 flag aSign;
4759 int32 aExp, shiftCount;
4760 uint64_t aSig, aSigExtra;
4762 aSig = extractFloatx80Frac( a );
4763 aExp = extractFloatx80Exp( a );
4764 aSign = extractFloatx80Sign( a );
4765 shiftCount = 0x403E - aExp;
4766 if ( shiftCount <= 0 ) {
4767 if ( shiftCount ) {
4768 float_raise( float_flag_invalid STATUS_VAR);
4769 if ( ! aSign
4770 || ( ( aExp == 0x7FFF )
4771 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4773 return LIT64( 0x7FFFFFFFFFFFFFFF );
4775 return (int64_t) LIT64( 0x8000000000000000 );
4777 aSigExtra = 0;
4779 else {
4780 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4782 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4786 /*----------------------------------------------------------------------------
4787 | Returns the result of converting the extended double-precision floating-
4788 | point value `a' to the 64-bit two's complement integer format. The
4789 | conversion is performed according to the IEC/IEEE Standard for Binary
4790 | Floating-Point Arithmetic, except that the conversion is always rounded
4791 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4792 | Otherwise, if the conversion overflows, the largest integer with the same
4793 | sign as `a' is returned.
4794 *----------------------------------------------------------------------------*/
4796 int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4798 flag aSign;
4799 int32 aExp, shiftCount;
4800 uint64_t aSig;
4801 int64 z;
4803 aSig = extractFloatx80Frac( a );
4804 aExp = extractFloatx80Exp( a );
4805 aSign = extractFloatx80Sign( a );
4806 shiftCount = aExp - 0x403E;
4807 if ( 0 <= shiftCount ) {
4808 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4809 if ( ( a.high != 0xC03E ) || aSig ) {
4810 float_raise( float_flag_invalid STATUS_VAR);
4811 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4812 return LIT64( 0x7FFFFFFFFFFFFFFF );
4815 return (int64_t) LIT64( 0x8000000000000000 );
4817 else if ( aExp < 0x3FFF ) {
4818 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4819 return 0;
4821 z = aSig>>( - shiftCount );
4822 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4823 STATUS(float_exception_flags) |= float_flag_inexact;
4825 if ( aSign ) z = - z;
4826 return z;
4830 /*----------------------------------------------------------------------------
4831 | Returns the result of converting the extended double-precision floating-
4832 | point value `a' to the single-precision floating-point format. The
4833 | conversion is performed according to the IEC/IEEE Standard for Binary
4834 | Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4837 float32 floatx80_to_float32(floatx80 a, float_status *status)
4839 flag aSign;
4840 int32 aExp;
4841 uint64_t aSig;
4843 aSig = extractFloatx80Frac( a );
4844 aExp = extractFloatx80Exp( a );
4845 aSign = extractFloatx80Sign( a );
4846 if ( aExp == 0x7FFF ) {
4847 if ( (uint64_t) ( aSig<<1 ) ) {
4848 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4850 return packFloat32( aSign, 0xFF, 0 );
4852 shift64RightJamming( aSig, 33, &aSig );
4853 if ( aExp || aSig ) aExp -= 0x3F81;
4854 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4858 /*----------------------------------------------------------------------------
4859 | Returns the result of converting the extended double-precision floating-
4860 | point value `a' to the double-precision floating-point format. The
4861 | conversion is performed according to the IEC/IEEE Standard for Binary
4862 | Floating-Point Arithmetic.
4863 *----------------------------------------------------------------------------*/
4865 float64 floatx80_to_float64(floatx80 a, float_status *status)
4867 flag aSign;
4868 int32 aExp;
4869 uint64_t aSig, zSig;
4871 aSig = extractFloatx80Frac( a );
4872 aExp = extractFloatx80Exp( a );
4873 aSign = extractFloatx80Sign( a );
4874 if ( aExp == 0x7FFF ) {
4875 if ( (uint64_t) ( aSig<<1 ) ) {
4876 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4878 return packFloat64( aSign, 0x7FF, 0 );
4880 shift64RightJamming( aSig, 1, &zSig );
4881 if ( aExp || aSig ) aExp -= 0x3C01;
4882 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4886 /*----------------------------------------------------------------------------
4887 | Returns the result of converting the extended double-precision floating-
4888 | point value `a' to the quadruple-precision floating-point format. The
4889 | conversion is performed according to the IEC/IEEE Standard for Binary
4890 | Floating-Point Arithmetic.
4891 *----------------------------------------------------------------------------*/
4893 float128 floatx80_to_float128(floatx80 a, float_status *status)
4895 flag aSign;
4896 int_fast16_t aExp;
4897 uint64_t aSig, zSig0, zSig1;
4899 aSig = extractFloatx80Frac( a );
4900 aExp = extractFloatx80Exp( a );
4901 aSign = extractFloatx80Sign( a );
4902 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4903 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4905 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4906 return packFloat128( aSign, aExp, zSig0, zSig1 );
4910 /*----------------------------------------------------------------------------
4911 | Rounds the extended double-precision floating-point value `a' to an integer,
4912 | and returns the result as an extended quadruple-precision floating-point
4913 | value. The operation is performed according to the IEC/IEEE Standard for
4914 | Binary Floating-Point Arithmetic.
4915 *----------------------------------------------------------------------------*/
4917 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4919 flag aSign;
4920 int32 aExp;
4921 uint64_t lastBitMask, roundBitsMask;
4922 floatx80 z;
4924 aExp = extractFloatx80Exp( a );
4925 if ( 0x403E <= aExp ) {
4926 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4927 return propagateFloatx80NaN( a, a STATUS_VAR );
4929 return a;
4931 if ( aExp < 0x3FFF ) {
4932 if ( ( aExp == 0 )
4933 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4934 return a;
4936 STATUS(float_exception_flags) |= float_flag_inexact;
4937 aSign = extractFloatx80Sign( a );
4938 switch ( STATUS(float_rounding_mode) ) {
4939 case float_round_nearest_even:
4940 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4942 return
4943 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4945 break;
4946 case float_round_ties_away:
4947 if (aExp == 0x3FFE) {
4948 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4950 break;
4951 case float_round_down:
4952 return
4953 aSign ?
4954 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4955 : packFloatx80( 0, 0, 0 );
4956 case float_round_up:
4957 return
4958 aSign ? packFloatx80( 1, 0, 0 )
4959 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4961 return packFloatx80( aSign, 0, 0 );
4963 lastBitMask = 1;
4964 lastBitMask <<= 0x403E - aExp;
4965 roundBitsMask = lastBitMask - 1;
4966 z = a;
4967 switch (STATUS(float_rounding_mode)) {
4968 case float_round_nearest_even:
4969 z.low += lastBitMask>>1;
4970 if ((z.low & roundBitsMask) == 0) {
4971 z.low &= ~lastBitMask;
4973 break;
4974 case float_round_ties_away:
4975 z.low += lastBitMask >> 1;
4976 break;
4977 case float_round_to_zero:
4978 break;
4979 case float_round_up:
4980 if (!extractFloatx80Sign(z)) {
4981 z.low += roundBitsMask;
4983 break;
4984 case float_round_down:
4985 if (extractFloatx80Sign(z)) {
4986 z.low += roundBitsMask;
4988 break;
4989 default:
4990 abort();
4992 z.low &= ~ roundBitsMask;
4993 if ( z.low == 0 ) {
4994 ++z.high;
4995 z.low = LIT64( 0x8000000000000000 );
4997 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4998 return z;
5002 /*----------------------------------------------------------------------------
5003 | Returns the result of adding the absolute values of the extended double-
5004 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5005 | negated before being returned. `zSign' is ignored if the result is a NaN.
5006 | The addition is performed according to the IEC/IEEE Standard for Binary
5007 | Floating-Point Arithmetic.
5008 *----------------------------------------------------------------------------*/
5010 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5011 float_status *status)
5013 int32 aExp, bExp, zExp;
5014 uint64_t aSig, bSig, zSig0, zSig1;
5015 int32 expDiff;
5017 aSig = extractFloatx80Frac( a );
5018 aExp = extractFloatx80Exp( a );
5019 bSig = extractFloatx80Frac( b );
5020 bExp = extractFloatx80Exp( b );
5021 expDiff = aExp - bExp;
5022 if ( 0 < expDiff ) {
5023 if ( aExp == 0x7FFF ) {
5024 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5025 return a;
5027 if ( bExp == 0 ) --expDiff;
5028 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5029 zExp = aExp;
5031 else if ( expDiff < 0 ) {
5032 if ( bExp == 0x7FFF ) {
5033 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5034 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5036 if ( aExp == 0 ) ++expDiff;
5037 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5038 zExp = bExp;
5040 else {
5041 if ( aExp == 0x7FFF ) {
5042 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5043 return propagateFloatx80NaN( a, b STATUS_VAR );
5045 return a;
5047 zSig1 = 0;
5048 zSig0 = aSig + bSig;
5049 if ( aExp == 0 ) {
5050 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5051 goto roundAndPack;
5053 zExp = aExp;
5054 goto shiftRight1;
5056 zSig0 = aSig + bSig;
5057 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5058 shiftRight1:
5059 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5060 zSig0 |= LIT64( 0x8000000000000000 );
5061 ++zExp;
5062 roundAndPack:
5063 return
5064 roundAndPackFloatx80(
5065 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5069 /*----------------------------------------------------------------------------
5070 | Returns the result of subtracting the absolute values of the extended
5071 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5072 | difference is negated before being returned. `zSign' is ignored if the
5073 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5074 | Standard for Binary Floating-Point Arithmetic.
5075 *----------------------------------------------------------------------------*/
5077 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5078 float_status *status)
5080 int32 aExp, bExp, zExp;
5081 uint64_t aSig, bSig, zSig0, zSig1;
5082 int32 expDiff;
5083 floatx80 z;
5085 aSig = extractFloatx80Frac( a );
5086 aExp = extractFloatx80Exp( a );
5087 bSig = extractFloatx80Frac( b );
5088 bExp = extractFloatx80Exp( b );
5089 expDiff = aExp - bExp;
5090 if ( 0 < expDiff ) goto aExpBigger;
5091 if ( expDiff < 0 ) goto bExpBigger;
5092 if ( aExp == 0x7FFF ) {
5093 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5094 return propagateFloatx80NaN( a, b STATUS_VAR );
5096 float_raise( float_flag_invalid STATUS_VAR);
5097 z.low = floatx80_default_nan_low;
5098 z.high = floatx80_default_nan_high;
5099 return z;
5101 if ( aExp == 0 ) {
5102 aExp = 1;
5103 bExp = 1;
5105 zSig1 = 0;
5106 if ( bSig < aSig ) goto aBigger;
5107 if ( aSig < bSig ) goto bBigger;
5108 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5109 bExpBigger:
5110 if ( bExp == 0x7FFF ) {
5111 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5112 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5114 if ( aExp == 0 ) ++expDiff;
5115 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5116 bBigger:
5117 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5118 zExp = bExp;
5119 zSign ^= 1;
5120 goto normalizeRoundAndPack;
5121 aExpBigger:
5122 if ( aExp == 0x7FFF ) {
5123 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5124 return a;
5126 if ( bExp == 0 ) --expDiff;
5127 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5128 aBigger:
5129 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5130 zExp = aExp;
5131 normalizeRoundAndPack:
5132 return
5133 normalizeRoundAndPackFloatx80(
5134 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5138 /*----------------------------------------------------------------------------
5139 | Returns the result of adding the extended double-precision floating-point
5140 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5141 | Standard for Binary Floating-Point Arithmetic.
5142 *----------------------------------------------------------------------------*/
5144 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5146 flag aSign, bSign;
5148 aSign = extractFloatx80Sign( a );
5149 bSign = extractFloatx80Sign( b );
5150 if ( aSign == bSign ) {
5151 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5153 else {
5154 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5159 /*----------------------------------------------------------------------------
5160 | Returns the result of subtracting the extended double-precision floating-
5161 | point values `a' and `b'. The operation is performed according to the
5162 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163 *----------------------------------------------------------------------------*/
5165 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5167 flag aSign, bSign;
5169 aSign = extractFloatx80Sign( a );
5170 bSign = extractFloatx80Sign( b );
5171 if ( aSign == bSign ) {
5172 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5174 else {
5175 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5180 /*----------------------------------------------------------------------------
5181 | Returns the result of multiplying the extended double-precision floating-
5182 | point values `a' and `b'. The operation is performed according to the
5183 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184 *----------------------------------------------------------------------------*/
5186 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5188 flag aSign, bSign, zSign;
5189 int32 aExp, bExp, zExp;
5190 uint64_t aSig, bSig, zSig0, zSig1;
5191 floatx80 z;
5193 aSig = extractFloatx80Frac( a );
5194 aExp = extractFloatx80Exp( a );
5195 aSign = extractFloatx80Sign( a );
5196 bSig = extractFloatx80Frac( b );
5197 bExp = extractFloatx80Exp( b );
5198 bSign = extractFloatx80Sign( b );
5199 zSign = aSign ^ bSign;
5200 if ( aExp == 0x7FFF ) {
5201 if ( (uint64_t) ( aSig<<1 )
5202 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5203 return propagateFloatx80NaN( a, b STATUS_VAR );
5205 if ( ( bExp | bSig ) == 0 ) goto invalid;
5206 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5208 if ( bExp == 0x7FFF ) {
5209 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5210 if ( ( aExp | aSig ) == 0 ) {
5211 invalid:
5212 float_raise( float_flag_invalid STATUS_VAR);
5213 z.low = floatx80_default_nan_low;
5214 z.high = floatx80_default_nan_high;
5215 return z;
5217 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5219 if ( aExp == 0 ) {
5220 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5221 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5223 if ( bExp == 0 ) {
5224 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5225 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5227 zExp = aExp + bExp - 0x3FFE;
5228 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5229 if ( 0 < (int64_t) zSig0 ) {
5230 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5231 --zExp;
5233 return
5234 roundAndPackFloatx80(
5235 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5239 /*----------------------------------------------------------------------------
5240 | Returns the result of dividing the extended double-precision floating-point
5241 | value `a' by the corresponding value `b'. The operation is performed
5242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5243 *----------------------------------------------------------------------------*/
5245 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5247 flag aSign, bSign, zSign;
5248 int32 aExp, bExp, zExp;
5249 uint64_t aSig, bSig, zSig0, zSig1;
5250 uint64_t rem0, rem1, rem2, term0, term1, term2;
5251 floatx80 z;
5253 aSig = extractFloatx80Frac( a );
5254 aExp = extractFloatx80Exp( a );
5255 aSign = extractFloatx80Sign( a );
5256 bSig = extractFloatx80Frac( b );
5257 bExp = extractFloatx80Exp( b );
5258 bSign = extractFloatx80Sign( b );
5259 zSign = aSign ^ bSign;
5260 if ( aExp == 0x7FFF ) {
5261 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5262 if ( bExp == 0x7FFF ) {
5263 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5264 goto invalid;
5266 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5268 if ( bExp == 0x7FFF ) {
5269 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5270 return packFloatx80( zSign, 0, 0 );
5272 if ( bExp == 0 ) {
5273 if ( bSig == 0 ) {
5274 if ( ( aExp | aSig ) == 0 ) {
5275 invalid:
5276 float_raise( float_flag_invalid STATUS_VAR);
5277 z.low = floatx80_default_nan_low;
5278 z.high = floatx80_default_nan_high;
5279 return z;
5281 float_raise( float_flag_divbyzero STATUS_VAR);
5282 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5284 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5286 if ( aExp == 0 ) {
5287 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5288 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5290 zExp = aExp - bExp + 0x3FFE;
5291 rem1 = 0;
5292 if ( bSig <= aSig ) {
5293 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5294 ++zExp;
5296 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5297 mul64To128( bSig, zSig0, &term0, &term1 );
5298 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5299 while ( (int64_t) rem0 < 0 ) {
5300 --zSig0;
5301 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5303 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5304 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5305 mul64To128( bSig, zSig1, &term1, &term2 );
5306 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5307 while ( (int64_t) rem1 < 0 ) {
5308 --zSig1;
5309 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5311 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5313 return
5314 roundAndPackFloatx80(
5315 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5319 /*----------------------------------------------------------------------------
5320 | Returns the remainder of the extended double-precision floating-point value
5321 | `a' with respect to the corresponding value `b'. The operation is performed
5322 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5323 *----------------------------------------------------------------------------*/
5325 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5327 flag aSign, zSign;
5328 int32 aExp, bExp, expDiff;
5329 uint64_t aSig0, aSig1, bSig;
5330 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5331 floatx80 z;
5333 aSig0 = extractFloatx80Frac( a );
5334 aExp = extractFloatx80Exp( a );
5335 aSign = extractFloatx80Sign( a );
5336 bSig = extractFloatx80Frac( b );
5337 bExp = extractFloatx80Exp( b );
5338 if ( aExp == 0x7FFF ) {
5339 if ( (uint64_t) ( aSig0<<1 )
5340 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5341 return propagateFloatx80NaN( a, b STATUS_VAR );
5343 goto invalid;
5345 if ( bExp == 0x7FFF ) {
5346 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5347 return a;
5349 if ( bExp == 0 ) {
5350 if ( bSig == 0 ) {
5351 invalid:
5352 float_raise( float_flag_invalid STATUS_VAR);
5353 z.low = floatx80_default_nan_low;
5354 z.high = floatx80_default_nan_high;
5355 return z;
5357 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5359 if ( aExp == 0 ) {
5360 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5361 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5363 bSig |= LIT64( 0x8000000000000000 );
5364 zSign = aSign;
5365 expDiff = aExp - bExp;
5366 aSig1 = 0;
5367 if ( expDiff < 0 ) {
5368 if ( expDiff < -1 ) return a;
5369 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5370 expDiff = 0;
5372 q = ( bSig <= aSig0 );
5373 if ( q ) aSig0 -= bSig;
5374 expDiff -= 64;
5375 while ( 0 < expDiff ) {
5376 q = estimateDiv128To64( aSig0, aSig1, bSig );
5377 q = ( 2 < q ) ? q - 2 : 0;
5378 mul64To128( bSig, q, &term0, &term1 );
5379 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5380 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5381 expDiff -= 62;
5383 expDiff += 64;
5384 if ( 0 < expDiff ) {
5385 q = estimateDiv128To64( aSig0, aSig1, bSig );
5386 q = ( 2 < q ) ? q - 2 : 0;
5387 q >>= 64 - expDiff;
5388 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5389 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5390 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5391 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5392 ++q;
5393 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5396 else {
5397 term1 = 0;
5398 term0 = bSig;
5400 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5401 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5402 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5403 && ( q & 1 ) )
5405 aSig0 = alternateASig0;
5406 aSig1 = alternateASig1;
5407 zSign = ! zSign;
5409 return
5410 normalizeRoundAndPackFloatx80(
5411 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5415 /*----------------------------------------------------------------------------
5416 | Returns the square root of the extended double-precision floating-point
5417 | value `a'. The operation is performed according to the IEC/IEEE Standard
5418 | for Binary Floating-Point Arithmetic.
5419 *----------------------------------------------------------------------------*/
5421 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5423 flag aSign;
5424 int32 aExp, zExp;
5425 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5426 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5427 floatx80 z;
5429 aSig0 = extractFloatx80Frac( a );
5430 aExp = extractFloatx80Exp( a );
5431 aSign = extractFloatx80Sign( a );
5432 if ( aExp == 0x7FFF ) {
5433 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5434 if ( ! aSign ) return a;
5435 goto invalid;
5437 if ( aSign ) {
5438 if ( ( aExp | aSig0 ) == 0 ) return a;
5439 invalid:
5440 float_raise( float_flag_invalid STATUS_VAR);
5441 z.low = floatx80_default_nan_low;
5442 z.high = floatx80_default_nan_high;
5443 return z;
5445 if ( aExp == 0 ) {
5446 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5447 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5449 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5450 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5451 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5452 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5453 doubleZSig0 = zSig0<<1;
5454 mul64To128( zSig0, zSig0, &term0, &term1 );
5455 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5456 while ( (int64_t) rem0 < 0 ) {
5457 --zSig0;
5458 doubleZSig0 -= 2;
5459 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5461 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5462 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5463 if ( zSig1 == 0 ) zSig1 = 1;
5464 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5465 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5466 mul64To128( zSig1, zSig1, &term2, &term3 );
5467 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5468 while ( (int64_t) rem1 < 0 ) {
5469 --zSig1;
5470 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5471 term3 |= 1;
5472 term2 |= doubleZSig0;
5473 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5475 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5477 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5478 zSig0 |= doubleZSig0;
5479 return
5480 roundAndPackFloatx80(
5481 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5485 /*----------------------------------------------------------------------------
5486 | Returns 1 if the extended double-precision floating-point value `a' is equal
5487 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5488 | raised if either operand is a NaN. Otherwise, the comparison is performed
5489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5492 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5495 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5496 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5497 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5498 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5500 float_raise( float_flag_invalid STATUS_VAR);
5501 return 0;
5503 return
5504 ( a.low == b.low )
5505 && ( ( a.high == b.high )
5506 || ( ( a.low == 0 )
5507 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5512 /*----------------------------------------------------------------------------
5513 | Returns 1 if the extended double-precision floating-point value `a' is
5514 | less than or equal to the corresponding value `b', and 0 otherwise. The
5515 | invalid exception is raised if either operand is a NaN. The comparison is
5516 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5517 | Arithmetic.
5518 *----------------------------------------------------------------------------*/
5520 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5522 flag aSign, bSign;
5524 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5525 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5526 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5527 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5529 float_raise( float_flag_invalid STATUS_VAR);
5530 return 0;
5532 aSign = extractFloatx80Sign( a );
5533 bSign = extractFloatx80Sign( b );
5534 if ( aSign != bSign ) {
5535 return
5536 aSign
5537 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5538 == 0 );
5540 return
5541 aSign ? le128( b.high, b.low, a.high, a.low )
5542 : le128( a.high, a.low, b.high, b.low );
5546 /*----------------------------------------------------------------------------
5547 | Returns 1 if the extended double-precision floating-point value `a' is
5548 | less than the corresponding value `b', and 0 otherwise. The invalid
5549 | exception is raised if either operand is a NaN. The comparison is performed
5550 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5551 *----------------------------------------------------------------------------*/
5553 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5555 flag aSign, bSign;
5557 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5558 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5559 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5560 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5562 float_raise( float_flag_invalid STATUS_VAR);
5563 return 0;
5565 aSign = extractFloatx80Sign( a );
5566 bSign = extractFloatx80Sign( b );
5567 if ( aSign != bSign ) {
5568 return
5569 aSign
5570 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5571 != 0 );
5573 return
5574 aSign ? lt128( b.high, b.low, a.high, a.low )
5575 : lt128( a.high, a.low, b.high, b.low );
5579 /*----------------------------------------------------------------------------
5580 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5581 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5582 | either operand is a NaN. The comparison is performed according to the
5583 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5584 *----------------------------------------------------------------------------*/
5585 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5587 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5588 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5589 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5590 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5592 float_raise( float_flag_invalid STATUS_VAR);
5593 return 1;
5595 return 0;
5598 /*----------------------------------------------------------------------------
5599 | Returns 1 if the extended double-precision floating-point value `a' is
5600 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5601 | cause an exception. The comparison is performed according to the IEC/IEEE
5602 | Standard for Binary Floating-Point Arithmetic.
5603 *----------------------------------------------------------------------------*/
5605 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5608 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5609 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5610 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5611 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5613 if ( floatx80_is_signaling_nan( a )
5614 || floatx80_is_signaling_nan( b ) ) {
5615 float_raise( float_flag_invalid STATUS_VAR);
5617 return 0;
5619 return
5620 ( a.low == b.low )
5621 && ( ( a.high == b.high )
5622 || ( ( a.low == 0 )
5623 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5628 /*----------------------------------------------------------------------------
5629 | Returns 1 if the extended double-precision floating-point value `a' is less
5630 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5631 | do not cause an exception. Otherwise, the comparison is performed according
5632 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5633 *----------------------------------------------------------------------------*/
5635 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5637 flag aSign, bSign;
5639 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5640 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5641 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5642 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5644 if ( floatx80_is_signaling_nan( a )
5645 || floatx80_is_signaling_nan( b ) ) {
5646 float_raise( float_flag_invalid STATUS_VAR);
5648 return 0;
5650 aSign = extractFloatx80Sign( a );
5651 bSign = extractFloatx80Sign( b );
5652 if ( aSign != bSign ) {
5653 return
5654 aSign
5655 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5656 == 0 );
5658 return
5659 aSign ? le128( b.high, b.low, a.high, a.low )
5660 : le128( a.high, a.low, b.high, b.low );
5664 /*----------------------------------------------------------------------------
5665 | Returns 1 if the extended double-precision floating-point value `a' is less
5666 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5667 | an exception. Otherwise, the comparison is performed according to the
5668 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5669 *----------------------------------------------------------------------------*/
5671 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5673 flag aSign, bSign;
5675 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5676 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5677 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5678 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5680 if ( floatx80_is_signaling_nan( a )
5681 || floatx80_is_signaling_nan( b ) ) {
5682 float_raise( float_flag_invalid STATUS_VAR);
5684 return 0;
5686 aSign = extractFloatx80Sign( a );
5687 bSign = extractFloatx80Sign( b );
5688 if ( aSign != bSign ) {
5689 return
5690 aSign
5691 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5692 != 0 );
5694 return
5695 aSign ? lt128( b.high, b.low, a.high, a.low )
5696 : lt128( a.high, a.low, b.high, b.low );
5700 /*----------------------------------------------------------------------------
5701 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5702 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5703 | The comparison is performed according to the IEC/IEEE Standard for Binary
5704 | Floating-Point Arithmetic.
5705 *----------------------------------------------------------------------------*/
5706 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5708 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5709 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5710 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5711 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5713 if ( floatx80_is_signaling_nan( a )
5714 || floatx80_is_signaling_nan( b ) ) {
5715 float_raise( float_flag_invalid STATUS_VAR);
5717 return 1;
5719 return 0;
5722 /*----------------------------------------------------------------------------
5723 | Returns the result of converting the quadruple-precision floating-point
5724 | value `a' to the 32-bit two's complement integer format. The conversion
5725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5726 | Arithmetic---which means in particular that the conversion is rounded
5727 | according to the current rounding mode. If `a' is a NaN, the largest
5728 | positive integer is returned. Otherwise, if the conversion overflows, the
5729 | largest integer with the same sign as `a' is returned.
5730 *----------------------------------------------------------------------------*/
5732 int32 float128_to_int32(float128 a, float_status *status)
5734 flag aSign;
5735 int32 aExp, shiftCount;
5736 uint64_t aSig0, aSig1;
5738 aSig1 = extractFloat128Frac1( a );
5739 aSig0 = extractFloat128Frac0( a );
5740 aExp = extractFloat128Exp( a );
5741 aSign = extractFloat128Sign( a );
5742 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5743 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5744 aSig0 |= ( aSig1 != 0 );
5745 shiftCount = 0x4028 - aExp;
5746 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5747 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5751 /*----------------------------------------------------------------------------
5752 | Returns the result of converting the quadruple-precision floating-point
5753 | value `a' to the 32-bit two's complement integer format. The conversion
5754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5755 | Arithmetic, except that the conversion is always rounded toward zero. If
5756 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5757 | conversion overflows, the largest integer with the same sign as `a' is
5758 | returned.
5759 *----------------------------------------------------------------------------*/
5761 int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
5763 flag aSign;
5764 int32 aExp, shiftCount;
5765 uint64_t aSig0, aSig1, savedASig;
5766 int32_t z;
5768 aSig1 = extractFloat128Frac1( a );
5769 aSig0 = extractFloat128Frac0( a );
5770 aExp = extractFloat128Exp( a );
5771 aSign = extractFloat128Sign( a );
5772 aSig0 |= ( aSig1 != 0 );
5773 if ( 0x401E < aExp ) {
5774 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5775 goto invalid;
5777 else if ( aExp < 0x3FFF ) {
5778 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5779 return 0;
5781 aSig0 |= LIT64( 0x0001000000000000 );
5782 shiftCount = 0x402F - aExp;
5783 savedASig = aSig0;
5784 aSig0 >>= shiftCount;
5785 z = aSig0;
5786 if ( aSign ) z = - z;
5787 if ( ( z < 0 ) ^ aSign ) {
5788 invalid:
5789 float_raise( float_flag_invalid STATUS_VAR);
5790 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5792 if ( ( aSig0<<shiftCount ) != savedASig ) {
5793 STATUS(float_exception_flags) |= float_flag_inexact;
5795 return z;
5799 /*----------------------------------------------------------------------------
5800 | Returns the result of converting the quadruple-precision floating-point
5801 | value `a' to the 64-bit two's complement integer format. The conversion
5802 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5803 | Arithmetic---which means in particular that the conversion is rounded
5804 | according to the current rounding mode. If `a' is a NaN, the largest
5805 | positive integer is returned. Otherwise, if the conversion overflows, the
5806 | largest integer with the same sign as `a' is returned.
5807 *----------------------------------------------------------------------------*/
5809 int64 float128_to_int64(float128 a, float_status *status)
5811 flag aSign;
5812 int32 aExp, shiftCount;
5813 uint64_t aSig0, aSig1;
5815 aSig1 = extractFloat128Frac1( a );
5816 aSig0 = extractFloat128Frac0( a );
5817 aExp = extractFloat128Exp( a );
5818 aSign = extractFloat128Sign( a );
5819 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5820 shiftCount = 0x402F - aExp;
5821 if ( shiftCount <= 0 ) {
5822 if ( 0x403E < aExp ) {
5823 float_raise( float_flag_invalid STATUS_VAR);
5824 if ( ! aSign
5825 || ( ( aExp == 0x7FFF )
5826 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5829 return LIT64( 0x7FFFFFFFFFFFFFFF );
5831 return (int64_t) LIT64( 0x8000000000000000 );
5833 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5835 else {
5836 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5838 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of converting the quadruple-precision floating-point
5844 | value `a' to the 64-bit two's complement integer format. The conversion
5845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5846 | Arithmetic, except that the conversion is always rounded toward zero.
5847 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5848 | the conversion overflows, the largest integer with the same sign as `a' is
5849 | returned.
5850 *----------------------------------------------------------------------------*/
5852 int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
5854 flag aSign;
5855 int32 aExp, shiftCount;
5856 uint64_t aSig0, aSig1;
5857 int64 z;
5859 aSig1 = extractFloat128Frac1( a );
5860 aSig0 = extractFloat128Frac0( a );
5861 aExp = extractFloat128Exp( a );
5862 aSign = extractFloat128Sign( a );
5863 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5864 shiftCount = aExp - 0x402F;
5865 if ( 0 < shiftCount ) {
5866 if ( 0x403E <= aExp ) {
5867 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5868 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5869 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5870 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5872 else {
5873 float_raise( float_flag_invalid STATUS_VAR);
5874 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5875 return LIT64( 0x7FFFFFFFFFFFFFFF );
5878 return (int64_t) LIT64( 0x8000000000000000 );
5880 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5881 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5882 STATUS(float_exception_flags) |= float_flag_inexact;
5885 else {
5886 if ( aExp < 0x3FFF ) {
5887 if ( aExp | aSig0 | aSig1 ) {
5888 STATUS(float_exception_flags) |= float_flag_inexact;
5890 return 0;
5892 z = aSig0>>( - shiftCount );
5893 if ( aSig1
5894 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5895 STATUS(float_exception_flags) |= float_flag_inexact;
5898 if ( aSign ) z = - z;
5899 return z;
5903 /*----------------------------------------------------------------------------
5904 | Returns the result of converting the quadruple-precision floating-point
5905 | value `a' to the single-precision floating-point format. The conversion
5906 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5907 | Arithmetic.
5908 *----------------------------------------------------------------------------*/
5910 float32 float128_to_float32(float128 a, float_status *status)
5912 flag aSign;
5913 int32 aExp;
5914 uint64_t aSig0, aSig1;
5915 uint32_t zSig;
5917 aSig1 = extractFloat128Frac1( a );
5918 aSig0 = extractFloat128Frac0( a );
5919 aExp = extractFloat128Exp( a );
5920 aSign = extractFloat128Sign( a );
5921 if ( aExp == 0x7FFF ) {
5922 if ( aSig0 | aSig1 ) {
5923 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5925 return packFloat32( aSign, 0xFF, 0 );
5927 aSig0 |= ( aSig1 != 0 );
5928 shift64RightJamming( aSig0, 18, &aSig0 );
5929 zSig = aSig0;
5930 if ( aExp || zSig ) {
5931 zSig |= 0x40000000;
5932 aExp -= 0x3F81;
5934 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5938 /*----------------------------------------------------------------------------
5939 | Returns the result of converting the quadruple-precision floating-point
5940 | value `a' to the double-precision floating-point format. The conversion
5941 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5942 | Arithmetic.
5943 *----------------------------------------------------------------------------*/
5945 float64 float128_to_float64(float128 a, float_status *status)
5947 flag aSign;
5948 int32 aExp;
5949 uint64_t aSig0, aSig1;
5951 aSig1 = extractFloat128Frac1( a );
5952 aSig0 = extractFloat128Frac0( a );
5953 aExp = extractFloat128Exp( a );
5954 aSign = extractFloat128Sign( a );
5955 if ( aExp == 0x7FFF ) {
5956 if ( aSig0 | aSig1 ) {
5957 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5959 return packFloat64( aSign, 0x7FF, 0 );
5961 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5962 aSig0 |= ( aSig1 != 0 );
5963 if ( aExp || aSig0 ) {
5964 aSig0 |= LIT64( 0x4000000000000000 );
5965 aExp -= 0x3C01;
5967 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5971 /*----------------------------------------------------------------------------
5972 | Returns the result of converting the quadruple-precision floating-point
5973 | value `a' to the extended double-precision floating-point format. The
5974 | conversion is performed according to the IEC/IEEE Standard for Binary
5975 | Floating-Point Arithmetic.
5976 *----------------------------------------------------------------------------*/
5978 floatx80 float128_to_floatx80(float128 a, float_status *status)
5980 flag aSign;
5981 int32 aExp;
5982 uint64_t aSig0, aSig1;
5984 aSig1 = extractFloat128Frac1( a );
5985 aSig0 = extractFloat128Frac0( a );
5986 aExp = extractFloat128Exp( a );
5987 aSign = extractFloat128Sign( a );
5988 if ( aExp == 0x7FFF ) {
5989 if ( aSig0 | aSig1 ) {
5990 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5992 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5994 if ( aExp == 0 ) {
5995 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5996 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5998 else {
5999 aSig0 |= LIT64( 0x0001000000000000 );
6001 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6002 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
6006 /*----------------------------------------------------------------------------
6007 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6008 | returns the result as a quadruple-precision floating-point value. The
6009 | operation is performed according to the IEC/IEEE Standard for Binary
6010 | Floating-Point Arithmetic.
6011 *----------------------------------------------------------------------------*/
6013 float128 float128_round_to_int(float128 a, float_status *status)
6015 flag aSign;
6016 int32 aExp;
6017 uint64_t lastBitMask, roundBitsMask;
6018 float128 z;
6020 aExp = extractFloat128Exp( a );
6021 if ( 0x402F <= aExp ) {
6022 if ( 0x406F <= aExp ) {
6023 if ( ( aExp == 0x7FFF )
6024 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6026 return propagateFloat128NaN( a, a STATUS_VAR );
6028 return a;
6030 lastBitMask = 1;
6031 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6032 roundBitsMask = lastBitMask - 1;
6033 z = a;
6034 switch (STATUS(float_rounding_mode)) {
6035 case float_round_nearest_even:
6036 if ( lastBitMask ) {
6037 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6038 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6040 else {
6041 if ( (int64_t) z.low < 0 ) {
6042 ++z.high;
6043 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6046 break;
6047 case float_round_ties_away:
6048 if (lastBitMask) {
6049 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6050 } else {
6051 if ((int64_t) z.low < 0) {
6052 ++z.high;
6055 break;
6056 case float_round_to_zero:
6057 break;
6058 case float_round_up:
6059 if (!extractFloat128Sign(z)) {
6060 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6062 break;
6063 case float_round_down:
6064 if (extractFloat128Sign(z)) {
6065 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6067 break;
6068 default:
6069 abort();
6071 z.low &= ~ roundBitsMask;
6073 else {
6074 if ( aExp < 0x3FFF ) {
6075 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6076 STATUS(float_exception_flags) |= float_flag_inexact;
6077 aSign = extractFloat128Sign( a );
6078 switch ( STATUS(float_rounding_mode) ) {
6079 case float_round_nearest_even:
6080 if ( ( aExp == 0x3FFE )
6081 && ( extractFloat128Frac0( a )
6082 | extractFloat128Frac1( a ) )
6084 return packFloat128( aSign, 0x3FFF, 0, 0 );
6086 break;
6087 case float_round_ties_away:
6088 if (aExp == 0x3FFE) {
6089 return packFloat128(aSign, 0x3FFF, 0, 0);
6091 break;
6092 case float_round_down:
6093 return
6094 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6095 : packFloat128( 0, 0, 0, 0 );
6096 case float_round_up:
6097 return
6098 aSign ? packFloat128( 1, 0, 0, 0 )
6099 : packFloat128( 0, 0x3FFF, 0, 0 );
6101 return packFloat128( aSign, 0, 0, 0 );
6103 lastBitMask = 1;
6104 lastBitMask <<= 0x402F - aExp;
6105 roundBitsMask = lastBitMask - 1;
6106 z.low = 0;
6107 z.high = a.high;
6108 switch (STATUS(float_rounding_mode)) {
6109 case float_round_nearest_even:
6110 z.high += lastBitMask>>1;
6111 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6112 z.high &= ~ lastBitMask;
6114 break;
6115 case float_round_ties_away:
6116 z.high += lastBitMask>>1;
6117 break;
6118 case float_round_to_zero:
6119 break;
6120 case float_round_up:
6121 if (!extractFloat128Sign(z)) {
6122 z.high |= ( a.low != 0 );
6123 z.high += roundBitsMask;
6125 break;
6126 case float_round_down:
6127 if (extractFloat128Sign(z)) {
6128 z.high |= (a.low != 0);
6129 z.high += roundBitsMask;
6131 break;
6132 default:
6133 abort();
6135 z.high &= ~ roundBitsMask;
6137 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6138 STATUS(float_exception_flags) |= float_flag_inexact;
6140 return z;
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of adding the absolute values of the quadruple-precision
6146 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6147 | before being returned. `zSign' is ignored if the result is a NaN.
6148 | The addition is performed according to the IEC/IEEE Standard for Binary
6149 | Floating-Point Arithmetic.
6150 *----------------------------------------------------------------------------*/
6152 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6153 float_status *status)
6155 int32 aExp, bExp, zExp;
6156 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6157 int32 expDiff;
6159 aSig1 = extractFloat128Frac1( a );
6160 aSig0 = extractFloat128Frac0( a );
6161 aExp = extractFloat128Exp( a );
6162 bSig1 = extractFloat128Frac1( b );
6163 bSig0 = extractFloat128Frac0( b );
6164 bExp = extractFloat128Exp( b );
6165 expDiff = aExp - bExp;
6166 if ( 0 < expDiff ) {
6167 if ( aExp == 0x7FFF ) {
6168 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6169 return a;
6171 if ( bExp == 0 ) {
6172 --expDiff;
6174 else {
6175 bSig0 |= LIT64( 0x0001000000000000 );
6177 shift128ExtraRightJamming(
6178 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6179 zExp = aExp;
6181 else if ( expDiff < 0 ) {
6182 if ( bExp == 0x7FFF ) {
6183 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6184 return packFloat128( zSign, 0x7FFF, 0, 0 );
6186 if ( aExp == 0 ) {
6187 ++expDiff;
6189 else {
6190 aSig0 |= LIT64( 0x0001000000000000 );
6192 shift128ExtraRightJamming(
6193 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6194 zExp = bExp;
6196 else {
6197 if ( aExp == 0x7FFF ) {
6198 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6199 return propagateFloat128NaN( a, b STATUS_VAR );
6201 return a;
6203 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6204 if ( aExp == 0 ) {
6205 if (STATUS(flush_to_zero)) {
6206 if (zSig0 | zSig1) {
6207 float_raise(float_flag_output_denormal STATUS_VAR);
6209 return packFloat128(zSign, 0, 0, 0);
6211 return packFloat128( zSign, 0, zSig0, zSig1 );
6213 zSig2 = 0;
6214 zSig0 |= LIT64( 0x0002000000000000 );
6215 zExp = aExp;
6216 goto shiftRight1;
6218 aSig0 |= LIT64( 0x0001000000000000 );
6219 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6220 --zExp;
6221 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6222 ++zExp;
6223 shiftRight1:
6224 shift128ExtraRightJamming(
6225 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6226 roundAndPack:
6227 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6231 /*----------------------------------------------------------------------------
6232 | Returns the result of subtracting the absolute values of the quadruple-
6233 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6234 | difference is negated before being returned. `zSign' is ignored if the
6235 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6236 | Standard for Binary Floating-Point Arithmetic.
6237 *----------------------------------------------------------------------------*/
6239 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6240 float_status *status)
6242 int32 aExp, bExp, zExp;
6243 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6244 int32 expDiff;
6245 float128 z;
6247 aSig1 = extractFloat128Frac1( a );
6248 aSig0 = extractFloat128Frac0( a );
6249 aExp = extractFloat128Exp( a );
6250 bSig1 = extractFloat128Frac1( b );
6251 bSig0 = extractFloat128Frac0( b );
6252 bExp = extractFloat128Exp( b );
6253 expDiff = aExp - bExp;
6254 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6255 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6256 if ( 0 < expDiff ) goto aExpBigger;
6257 if ( expDiff < 0 ) goto bExpBigger;
6258 if ( aExp == 0x7FFF ) {
6259 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6260 return propagateFloat128NaN( a, b STATUS_VAR );
6262 float_raise( float_flag_invalid STATUS_VAR);
6263 z.low = float128_default_nan_low;
6264 z.high = float128_default_nan_high;
6265 return z;
6267 if ( aExp == 0 ) {
6268 aExp = 1;
6269 bExp = 1;
6271 if ( bSig0 < aSig0 ) goto aBigger;
6272 if ( aSig0 < bSig0 ) goto bBigger;
6273 if ( bSig1 < aSig1 ) goto aBigger;
6274 if ( aSig1 < bSig1 ) goto bBigger;
6275 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6276 bExpBigger:
6277 if ( bExp == 0x7FFF ) {
6278 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6279 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6281 if ( aExp == 0 ) {
6282 ++expDiff;
6284 else {
6285 aSig0 |= LIT64( 0x4000000000000000 );
6287 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6288 bSig0 |= LIT64( 0x4000000000000000 );
6289 bBigger:
6290 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6291 zExp = bExp;
6292 zSign ^= 1;
6293 goto normalizeRoundAndPack;
6294 aExpBigger:
6295 if ( aExp == 0x7FFF ) {
6296 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6297 return a;
6299 if ( bExp == 0 ) {
6300 --expDiff;
6302 else {
6303 bSig0 |= LIT64( 0x4000000000000000 );
6305 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6306 aSig0 |= LIT64( 0x4000000000000000 );
6307 aBigger:
6308 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6309 zExp = aExp;
6310 normalizeRoundAndPack:
6311 --zExp;
6312 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of adding the quadruple-precision floating-point values
6318 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6319 | for Binary Floating-Point Arithmetic.
6320 *----------------------------------------------------------------------------*/
6322 float128 float128_add(float128 a, float128 b, float_status *status)
6324 flag aSign, bSign;
6326 aSign = extractFloat128Sign( a );
6327 bSign = extractFloat128Sign( b );
6328 if ( aSign == bSign ) {
6329 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6331 else {
6332 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6337 /*----------------------------------------------------------------------------
6338 | Returns the result of subtracting the quadruple-precision floating-point
6339 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6340 | Standard for Binary Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6343 float128 float128_sub(float128 a, float128 b, float_status *status)
6345 flag aSign, bSign;
6347 aSign = extractFloat128Sign( a );
6348 bSign = extractFloat128Sign( b );
6349 if ( aSign == bSign ) {
6350 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6352 else {
6353 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6358 /*----------------------------------------------------------------------------
6359 | Returns the result of multiplying the quadruple-precision floating-point
6360 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6361 | Standard for Binary Floating-Point Arithmetic.
6362 *----------------------------------------------------------------------------*/
6364 float128 float128_mul(float128 a, float128 b, float_status *status)
6366 flag aSign, bSign, zSign;
6367 int32 aExp, bExp, zExp;
6368 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6369 float128 z;
6371 aSig1 = extractFloat128Frac1( a );
6372 aSig0 = extractFloat128Frac0( a );
6373 aExp = extractFloat128Exp( a );
6374 aSign = extractFloat128Sign( a );
6375 bSig1 = extractFloat128Frac1( b );
6376 bSig0 = extractFloat128Frac0( b );
6377 bExp = extractFloat128Exp( b );
6378 bSign = extractFloat128Sign( b );
6379 zSign = aSign ^ bSign;
6380 if ( aExp == 0x7FFF ) {
6381 if ( ( aSig0 | aSig1 )
6382 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6383 return propagateFloat128NaN( a, b STATUS_VAR );
6385 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6386 return packFloat128( zSign, 0x7FFF, 0, 0 );
6388 if ( bExp == 0x7FFF ) {
6389 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6390 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6391 invalid:
6392 float_raise( float_flag_invalid STATUS_VAR);
6393 z.low = float128_default_nan_low;
6394 z.high = float128_default_nan_high;
6395 return z;
6397 return packFloat128( zSign, 0x7FFF, 0, 0 );
6399 if ( aExp == 0 ) {
6400 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6401 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6403 if ( bExp == 0 ) {
6404 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6405 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6407 zExp = aExp + bExp - 0x4000;
6408 aSig0 |= LIT64( 0x0001000000000000 );
6409 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6410 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6411 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6412 zSig2 |= ( zSig3 != 0 );
6413 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6414 shift128ExtraRightJamming(
6415 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6416 ++zExp;
6418 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6422 /*----------------------------------------------------------------------------
6423 | Returns the result of dividing the quadruple-precision floating-point value
6424 | `a' by the corresponding value `b'. The operation is performed according to
6425 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6426 *----------------------------------------------------------------------------*/
6428 float128 float128_div(float128 a, float128 b, float_status *status)
6430 flag aSign, bSign, zSign;
6431 int32 aExp, bExp, zExp;
6432 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6433 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6434 float128 z;
6436 aSig1 = extractFloat128Frac1( a );
6437 aSig0 = extractFloat128Frac0( a );
6438 aExp = extractFloat128Exp( a );
6439 aSign = extractFloat128Sign( a );
6440 bSig1 = extractFloat128Frac1( b );
6441 bSig0 = extractFloat128Frac0( b );
6442 bExp = extractFloat128Exp( b );
6443 bSign = extractFloat128Sign( b );
6444 zSign = aSign ^ bSign;
6445 if ( aExp == 0x7FFF ) {
6446 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6447 if ( bExp == 0x7FFF ) {
6448 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6449 goto invalid;
6451 return packFloat128( zSign, 0x7FFF, 0, 0 );
6453 if ( bExp == 0x7FFF ) {
6454 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6455 return packFloat128( zSign, 0, 0, 0 );
6457 if ( bExp == 0 ) {
6458 if ( ( bSig0 | bSig1 ) == 0 ) {
6459 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6460 invalid:
6461 float_raise( float_flag_invalid STATUS_VAR);
6462 z.low = float128_default_nan_low;
6463 z.high = float128_default_nan_high;
6464 return z;
6466 float_raise( float_flag_divbyzero STATUS_VAR);
6467 return packFloat128( zSign, 0x7FFF, 0, 0 );
6469 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6471 if ( aExp == 0 ) {
6472 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6473 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6475 zExp = aExp - bExp + 0x3FFD;
6476 shortShift128Left(
6477 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6478 shortShift128Left(
6479 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6480 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6481 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6482 ++zExp;
6484 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6485 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6486 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6487 while ( (int64_t) rem0 < 0 ) {
6488 --zSig0;
6489 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6491 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6492 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6493 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6494 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6495 while ( (int64_t) rem1 < 0 ) {
6496 --zSig1;
6497 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6499 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6501 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6502 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6506 /*----------------------------------------------------------------------------
6507 | Returns the remainder of the quadruple-precision floating-point value `a'
6508 | with respect to the corresponding value `b'. The operation is performed
6509 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6510 *----------------------------------------------------------------------------*/
6512 float128 float128_rem(float128 a, float128 b, float_status *status)
6514 flag aSign, zSign;
6515 int32 aExp, bExp, expDiff;
6516 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6517 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6518 int64_t sigMean0;
6519 float128 z;
6521 aSig1 = extractFloat128Frac1( a );
6522 aSig0 = extractFloat128Frac0( a );
6523 aExp = extractFloat128Exp( a );
6524 aSign = extractFloat128Sign( a );
6525 bSig1 = extractFloat128Frac1( b );
6526 bSig0 = extractFloat128Frac0( b );
6527 bExp = extractFloat128Exp( b );
6528 if ( aExp == 0x7FFF ) {
6529 if ( ( aSig0 | aSig1 )
6530 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6531 return propagateFloat128NaN( a, b STATUS_VAR );
6533 goto invalid;
6535 if ( bExp == 0x7FFF ) {
6536 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6537 return a;
6539 if ( bExp == 0 ) {
6540 if ( ( bSig0 | bSig1 ) == 0 ) {
6541 invalid:
6542 float_raise( float_flag_invalid STATUS_VAR);
6543 z.low = float128_default_nan_low;
6544 z.high = float128_default_nan_high;
6545 return z;
6547 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6549 if ( aExp == 0 ) {
6550 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6551 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6553 expDiff = aExp - bExp;
6554 if ( expDiff < -1 ) return a;
6555 shortShift128Left(
6556 aSig0 | LIT64( 0x0001000000000000 ),
6557 aSig1,
6558 15 - ( expDiff < 0 ),
6559 &aSig0,
6560 &aSig1
6562 shortShift128Left(
6563 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6564 q = le128( bSig0, bSig1, aSig0, aSig1 );
6565 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6566 expDiff -= 64;
6567 while ( 0 < expDiff ) {
6568 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6569 q = ( 4 < q ) ? q - 4 : 0;
6570 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6571 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6572 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6573 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6574 expDiff -= 61;
6576 if ( -64 < expDiff ) {
6577 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6578 q = ( 4 < q ) ? q - 4 : 0;
6579 q >>= - expDiff;
6580 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6581 expDiff += 52;
6582 if ( expDiff < 0 ) {
6583 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6585 else {
6586 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6588 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6589 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6591 else {
6592 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6593 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6595 do {
6596 alternateASig0 = aSig0;
6597 alternateASig1 = aSig1;
6598 ++q;
6599 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6600 } while ( 0 <= (int64_t) aSig0 );
6601 add128(
6602 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6603 if ( ( sigMean0 < 0 )
6604 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6605 aSig0 = alternateASig0;
6606 aSig1 = alternateASig1;
6608 zSign = ( (int64_t) aSig0 < 0 );
6609 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6610 return
6611 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6615 /*----------------------------------------------------------------------------
6616 | Returns the square root of the quadruple-precision floating-point value `a'.
6617 | The operation is performed according to the IEC/IEEE Standard for Binary
6618 | Floating-Point Arithmetic.
6619 *----------------------------------------------------------------------------*/
6621 float128 float128_sqrt(float128 a, float_status *status)
6623 flag aSign;
6624 int32 aExp, zExp;
6625 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6626 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6627 float128 z;
6629 aSig1 = extractFloat128Frac1( a );
6630 aSig0 = extractFloat128Frac0( a );
6631 aExp = extractFloat128Exp( a );
6632 aSign = extractFloat128Sign( a );
6633 if ( aExp == 0x7FFF ) {
6634 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6635 if ( ! aSign ) return a;
6636 goto invalid;
6638 if ( aSign ) {
6639 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6640 invalid:
6641 float_raise( float_flag_invalid STATUS_VAR);
6642 z.low = float128_default_nan_low;
6643 z.high = float128_default_nan_high;
6644 return z;
6646 if ( aExp == 0 ) {
6647 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6648 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6650 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6651 aSig0 |= LIT64( 0x0001000000000000 );
6652 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6653 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6654 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6655 doubleZSig0 = zSig0<<1;
6656 mul64To128( zSig0, zSig0, &term0, &term1 );
6657 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6658 while ( (int64_t) rem0 < 0 ) {
6659 --zSig0;
6660 doubleZSig0 -= 2;
6661 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6663 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6664 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6665 if ( zSig1 == 0 ) zSig1 = 1;
6666 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6667 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6668 mul64To128( zSig1, zSig1, &term2, &term3 );
6669 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6670 while ( (int64_t) rem1 < 0 ) {
6671 --zSig1;
6672 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6673 term3 |= 1;
6674 term2 |= doubleZSig0;
6675 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6677 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6679 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6680 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6684 /*----------------------------------------------------------------------------
6685 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6686 | the corresponding value `b', and 0 otherwise. The invalid exception is
6687 | raised if either operand is a NaN. Otherwise, the comparison is performed
6688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6691 int float128_eq(float128 a, float128 b, float_status *status)
6694 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6695 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6696 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6697 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6699 float_raise( float_flag_invalid STATUS_VAR);
6700 return 0;
6702 return
6703 ( a.low == b.low )
6704 && ( ( a.high == b.high )
6705 || ( ( a.low == 0 )
6706 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6711 /*----------------------------------------------------------------------------
6712 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6713 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6714 | exception is raised if either operand is a NaN. The comparison is performed
6715 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6716 *----------------------------------------------------------------------------*/
6718 int float128_le(float128 a, float128 b, float_status *status)
6720 flag aSign, bSign;
6722 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6723 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6724 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6725 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6727 float_raise( float_flag_invalid STATUS_VAR);
6728 return 0;
6730 aSign = extractFloat128Sign( a );
6731 bSign = extractFloat128Sign( b );
6732 if ( aSign != bSign ) {
6733 return
6734 aSign
6735 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6736 == 0 );
6738 return
6739 aSign ? le128( b.high, b.low, a.high, a.low )
6740 : le128( a.high, a.low, b.high, b.low );
6744 /*----------------------------------------------------------------------------
6745 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6746 | the corresponding value `b', and 0 otherwise. The invalid exception is
6747 | raised if either operand is a NaN. The comparison is performed according
6748 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6749 *----------------------------------------------------------------------------*/
6751 int float128_lt(float128 a, float128 b, float_status *status)
6753 flag aSign, bSign;
6755 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6756 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6757 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6758 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6760 float_raise( float_flag_invalid STATUS_VAR);
6761 return 0;
6763 aSign = extractFloat128Sign( a );
6764 bSign = extractFloat128Sign( b );
6765 if ( aSign != bSign ) {
6766 return
6767 aSign
6768 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6769 != 0 );
6771 return
6772 aSign ? lt128( b.high, b.low, a.high, a.low )
6773 : lt128( a.high, a.low, b.high, b.low );
6777 /*----------------------------------------------------------------------------
6778 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6779 | be compared, and 0 otherwise. The invalid exception is raised if either
6780 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6781 | Standard for Binary Floating-Point Arithmetic.
6782 *----------------------------------------------------------------------------*/
6784 int float128_unordered(float128 a, float128 b, float_status *status)
6786 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6787 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6788 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6789 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6791 float_raise( float_flag_invalid STATUS_VAR);
6792 return 1;
6794 return 0;
6797 /*----------------------------------------------------------------------------
6798 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6799 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6800 | exception. The comparison is performed according to the IEC/IEEE Standard
6801 | for Binary Floating-Point Arithmetic.
6802 *----------------------------------------------------------------------------*/
6804 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6807 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6808 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6809 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6810 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6812 if ( float128_is_signaling_nan( a )
6813 || float128_is_signaling_nan( b ) ) {
6814 float_raise( float_flag_invalid STATUS_VAR);
6816 return 0;
6818 return
6819 ( a.low == b.low )
6820 && ( ( a.high == b.high )
6821 || ( ( a.low == 0 )
6822 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6827 /*----------------------------------------------------------------------------
6828 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6829 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6830 | cause an exception. Otherwise, the comparison is performed according to the
6831 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6832 *----------------------------------------------------------------------------*/
6834 int float128_le_quiet(float128 a, float128 b, float_status *status)
6836 flag aSign, bSign;
6838 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6839 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6840 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6841 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6843 if ( float128_is_signaling_nan( a )
6844 || float128_is_signaling_nan( b ) ) {
6845 float_raise( float_flag_invalid STATUS_VAR);
6847 return 0;
6849 aSign = extractFloat128Sign( a );
6850 bSign = extractFloat128Sign( b );
6851 if ( aSign != bSign ) {
6852 return
6853 aSign
6854 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6855 == 0 );
6857 return
6858 aSign ? le128( b.high, b.low, a.high, a.low )
6859 : le128( a.high, a.low, b.high, b.low );
6863 /*----------------------------------------------------------------------------
6864 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6865 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6866 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6867 | Standard for Binary Floating-Point Arithmetic.
6868 *----------------------------------------------------------------------------*/
6870 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6872 flag aSign, bSign;
6874 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6875 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6876 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6877 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6879 if ( float128_is_signaling_nan( a )
6880 || float128_is_signaling_nan( b ) ) {
6881 float_raise( float_flag_invalid STATUS_VAR);
6883 return 0;
6885 aSign = extractFloat128Sign( a );
6886 bSign = extractFloat128Sign( b );
6887 if ( aSign != bSign ) {
6888 return
6889 aSign
6890 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6891 != 0 );
6893 return
6894 aSign ? lt128( b.high, b.low, a.high, a.low )
6895 : lt128( a.high, a.low, b.high, b.low );
6899 /*----------------------------------------------------------------------------
6900 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6901 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6902 | comparison is performed according to the IEC/IEEE Standard for Binary
6903 | Floating-Point Arithmetic.
6904 *----------------------------------------------------------------------------*/
6906 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6908 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6909 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6910 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6911 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6913 if ( float128_is_signaling_nan( a )
6914 || float128_is_signaling_nan( b ) ) {
6915 float_raise( float_flag_invalid STATUS_VAR);
6917 return 1;
6919 return 0;
6922 /* misc functions */
6923 float32 uint32_to_float32(uint32_t a, float_status *status)
6925 return int64_to_float32(a STATUS_VAR);
6928 float64 uint32_to_float64(uint32_t a, float_status *status)
6930 return int64_to_float64(a STATUS_VAR);
6933 uint32 float32_to_uint32(float32 a, float_status *status)
6935 int64_t v;
6936 uint32 res;
6937 int old_exc_flags = get_float_exception_flags(status);
6939 v = float32_to_int64(a STATUS_VAR);
6940 if (v < 0) {
6941 res = 0;
6942 } else if (v > 0xffffffff) {
6943 res = 0xffffffff;
6944 } else {
6945 return v;
6947 set_float_exception_flags(old_exc_flags, status);
6948 float_raise(float_flag_invalid STATUS_VAR);
6949 return res;
6952 uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
6954 int64_t v;
6955 uint32 res;
6956 int old_exc_flags = get_float_exception_flags(status);
6958 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6959 if (v < 0) {
6960 res = 0;
6961 } else if (v > 0xffffffff) {
6962 res = 0xffffffff;
6963 } else {
6964 return v;
6966 set_float_exception_flags(old_exc_flags, status);
6967 float_raise(float_flag_invalid STATUS_VAR);
6968 return res;
6971 int_fast16_t float32_to_int16(float32 a, float_status *status)
6973 int32_t v;
6974 int_fast16_t res;
6975 int old_exc_flags = get_float_exception_flags(status);
6977 v = float32_to_int32(a STATUS_VAR);
6978 if (v < -0x8000) {
6979 res = -0x8000;
6980 } else if (v > 0x7fff) {
6981 res = 0x7fff;
6982 } else {
6983 return v;
6986 set_float_exception_flags(old_exc_flags, status);
6987 float_raise(float_flag_invalid STATUS_VAR);
6988 return res;
6991 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
6993 int32_t v;
6994 uint_fast16_t res;
6995 int old_exc_flags = get_float_exception_flags(status);
6997 v = float32_to_int32(a STATUS_VAR);
6998 if (v < 0) {
6999 res = 0;
7000 } else if (v > 0xffff) {
7001 res = 0xffff;
7002 } else {
7003 return v;
7006 set_float_exception_flags(old_exc_flags, status);
7007 float_raise(float_flag_invalid STATUS_VAR);
7008 return res;
7011 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7013 int64_t v;
7014 uint_fast16_t res;
7015 int old_exc_flags = get_float_exception_flags(status);
7017 v = float32_to_int64_round_to_zero(a STATUS_VAR);
7018 if (v < 0) {
7019 res = 0;
7020 } else if (v > 0xffff) {
7021 res = 0xffff;
7022 } else {
7023 return v;
7025 set_float_exception_flags(old_exc_flags, status);
7026 float_raise(float_flag_invalid STATUS_VAR);
7027 return res;
7030 uint32 float64_to_uint32(float64 a, float_status *status)
7032 uint64_t v;
7033 uint32 res;
7034 int old_exc_flags = get_float_exception_flags(status);
7036 v = float64_to_uint64(a STATUS_VAR);
7037 if (v > 0xffffffff) {
7038 res = 0xffffffff;
7039 } else {
7040 return v;
7042 set_float_exception_flags(old_exc_flags, status);
7043 float_raise(float_flag_invalid STATUS_VAR);
7044 return res;
7047 uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
7049 uint64_t v;
7050 uint32 res;
7051 int old_exc_flags = get_float_exception_flags(status);
7053 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
7054 if (v > 0xffffffff) {
7055 res = 0xffffffff;
7056 } else {
7057 return v;
7059 set_float_exception_flags(old_exc_flags, status);
7060 float_raise(float_flag_invalid STATUS_VAR);
7061 return res;
7064 int_fast16_t float64_to_int16(float64 a, float_status *status)
7066 int64_t v;
7067 int_fast16_t res;
7068 int old_exc_flags = get_float_exception_flags(status);
7070 v = float64_to_int32(a STATUS_VAR);
7071 if (v < -0x8000) {
7072 res = -0x8000;
7073 } else if (v > 0x7fff) {
7074 res = 0x7fff;
7075 } else {
7076 return v;
7079 set_float_exception_flags(old_exc_flags, status);
7080 float_raise(float_flag_invalid STATUS_VAR);
7081 return res;
7084 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7086 int64_t v;
7087 uint_fast16_t res;
7088 int old_exc_flags = get_float_exception_flags(status);
7090 v = float64_to_int32(a STATUS_VAR);
7091 if (v < 0) {
7092 res = 0;
7093 } else if (v > 0xffff) {
7094 res = 0xffff;
7095 } else {
7096 return v;
7099 set_float_exception_flags(old_exc_flags, status);
7100 float_raise(float_flag_invalid STATUS_VAR);
7101 return res;
7104 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7106 int64_t v;
7107 uint_fast16_t res;
7108 int old_exc_flags = get_float_exception_flags(status);
7110 v = float64_to_int64_round_to_zero(a STATUS_VAR);
7111 if (v < 0) {
7112 res = 0;
7113 } else if (v > 0xffff) {
7114 res = 0xffff;
7115 } else {
7116 return v;
7118 set_float_exception_flags(old_exc_flags, status);
7119 float_raise(float_flag_invalid STATUS_VAR);
7120 return res;
7123 /*----------------------------------------------------------------------------
7124 | Returns the result of converting the double-precision floating-point value
7125 | `a' to the 64-bit unsigned integer format. The conversion is
7126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7127 | Arithmetic---which means in particular that the conversion is rounded
7128 | according to the current rounding mode. If `a' is a NaN, the largest
7129 | positive integer is returned. If the conversion overflows, the
7130 | largest unsigned integer is returned. If 'a' is negative, the value is
7131 | rounded and zero is returned; negative values that do not round to zero
7132 | will raise the inexact exception.
7133 *----------------------------------------------------------------------------*/
7135 uint64_t float64_to_uint64(float64 a, float_status *status)
7137 flag aSign;
7138 int_fast16_t aExp, shiftCount;
7139 uint64_t aSig, aSigExtra;
7140 a = float64_squash_input_denormal(a STATUS_VAR);
7142 aSig = extractFloat64Frac(a);
7143 aExp = extractFloat64Exp(a);
7144 aSign = extractFloat64Sign(a);
7145 if (aSign && (aExp > 1022)) {
7146 float_raise(float_flag_invalid STATUS_VAR);
7147 if (float64_is_any_nan(a)) {
7148 return LIT64(0xFFFFFFFFFFFFFFFF);
7149 } else {
7150 return 0;
7153 if (aExp) {
7154 aSig |= LIT64(0x0010000000000000);
7156 shiftCount = 0x433 - aExp;
7157 if (shiftCount <= 0) {
7158 if (0x43E < aExp) {
7159 float_raise(float_flag_invalid STATUS_VAR);
7160 return LIT64(0xFFFFFFFFFFFFFFFF);
7162 aSigExtra = 0;
7163 aSig <<= -shiftCount;
7164 } else {
7165 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7167 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7170 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7172 signed char current_rounding_mode = STATUS(float_rounding_mode);
7173 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7174 int64_t v = float64_to_uint64(a STATUS_VAR);
7175 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7176 return v;
7179 #define COMPARE(s, nan_exp) \
7180 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7181 int is_quiet, float_status *status) \
7183 flag aSign, bSign; \
7184 uint ## s ## _t av, bv; \
7185 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7186 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7188 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7189 extractFloat ## s ## Frac( a ) ) || \
7190 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7191 extractFloat ## s ## Frac( b ) )) { \
7192 if (!is_quiet || \
7193 float ## s ## _is_signaling_nan( a ) || \
7194 float ## s ## _is_signaling_nan( b ) ) { \
7195 float_raise( float_flag_invalid STATUS_VAR); \
7197 return float_relation_unordered; \
7199 aSign = extractFloat ## s ## Sign( a ); \
7200 bSign = extractFloat ## s ## Sign( b ); \
7201 av = float ## s ## _val(a); \
7202 bv = float ## s ## _val(b); \
7203 if ( aSign != bSign ) { \
7204 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7205 /* zero case */ \
7206 return float_relation_equal; \
7207 } else { \
7208 return 1 - (2 * aSign); \
7210 } else { \
7211 if (av == bv) { \
7212 return float_relation_equal; \
7213 } else { \
7214 return 1 - 2 * (aSign ^ ( av < bv )); \
7219 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7221 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7224 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7225 float_status *status) \
7227 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7230 COMPARE(32, 0xff)
7231 COMPARE(64, 0x7ff)
7233 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7234 int is_quiet, float_status *status)
7236 flag aSign, bSign;
7238 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7239 ( extractFloatx80Frac( a )<<1 ) ) ||
7240 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7241 ( extractFloatx80Frac( b )<<1 ) )) {
7242 if (!is_quiet ||
7243 floatx80_is_signaling_nan( a ) ||
7244 floatx80_is_signaling_nan( b ) ) {
7245 float_raise( float_flag_invalid STATUS_VAR);
7247 return float_relation_unordered;
7249 aSign = extractFloatx80Sign( a );
7250 bSign = extractFloatx80Sign( b );
7251 if ( aSign != bSign ) {
7253 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7254 ( ( a.low | b.low ) == 0 ) ) {
7255 /* zero case */
7256 return float_relation_equal;
7257 } else {
7258 return 1 - (2 * aSign);
7260 } else {
7261 if (a.low == b.low && a.high == b.high) {
7262 return float_relation_equal;
7263 } else {
7264 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7269 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7271 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7274 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7276 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7279 static inline int float128_compare_internal(float128 a, float128 b,
7280 int is_quiet, float_status *status)
7282 flag aSign, bSign;
7284 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7285 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7286 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7287 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7288 if (!is_quiet ||
7289 float128_is_signaling_nan( a ) ||
7290 float128_is_signaling_nan( b ) ) {
7291 float_raise( float_flag_invalid STATUS_VAR);
7293 return float_relation_unordered;
7295 aSign = extractFloat128Sign( a );
7296 bSign = extractFloat128Sign( b );
7297 if ( aSign != bSign ) {
7298 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7299 /* zero case */
7300 return float_relation_equal;
7301 } else {
7302 return 1 - (2 * aSign);
7304 } else {
7305 if (a.low == b.low && a.high == b.high) {
7306 return float_relation_equal;
7307 } else {
7308 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7313 int float128_compare(float128 a, float128 b, float_status *status)
7315 return float128_compare_internal(a, b, 0 STATUS_VAR);
7318 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7320 return float128_compare_internal(a, b, 1 STATUS_VAR);
7323 /* min() and max() functions. These can't be implemented as
7324 * 'compare and pick one input' because that would mishandle
7325 * NaNs and +0 vs -0.
7327 * minnum() and maxnum() functions. These are similar to the min()
7328 * and max() functions but if one of the arguments is a QNaN and
7329 * the other is numerical then the numerical argument is returned.
7330 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7331 * and maxNum() operations. min() and max() are the typical min/max
7332 * semantics provided by many CPUs which predate that specification.
7334 * minnummag() and maxnummag() functions correspond to the minNumMag()
7335 * and maxNumMag() operations from IEEE 754-2008.
7337 #define MINMAX(s) \
7338 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7339 int ismin, int isieee, \
7340 int ismag, \
7341 float_status *status) \
7343 flag aSign, bSign; \
7344 uint ## s ## _t av, bv, aav, abv; \
7345 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7346 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7347 if (float ## s ## _is_any_nan(a) || \
7348 float ## s ## _is_any_nan(b)) { \
7349 if (isieee) { \
7350 if (float ## s ## _is_quiet_nan(a) && \
7351 !float ## s ##_is_any_nan(b)) { \
7352 return b; \
7353 } else if (float ## s ## _is_quiet_nan(b) && \
7354 !float ## s ## _is_any_nan(a)) { \
7355 return a; \
7358 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7360 aSign = extractFloat ## s ## Sign(a); \
7361 bSign = extractFloat ## s ## Sign(b); \
7362 av = float ## s ## _val(a); \
7363 bv = float ## s ## _val(b); \
7364 if (ismag) { \
7365 aav = float ## s ## _abs(av); \
7366 abv = float ## s ## _abs(bv); \
7367 if (aav != abv) { \
7368 if (ismin) { \
7369 return (aav < abv) ? a : b; \
7370 } else { \
7371 return (aav < abv) ? b : a; \
7375 if (aSign != bSign) { \
7376 if (ismin) { \
7377 return aSign ? a : b; \
7378 } else { \
7379 return aSign ? b : a; \
7381 } else { \
7382 if (ismin) { \
7383 return (aSign ^ (av < bv)) ? a : b; \
7384 } else { \
7385 return (aSign ^ (av < bv)) ? b : a; \
7390 float ## s float ## s ## _min(float ## s a, float ## s b, \
7391 float_status *status) \
7393 return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR); \
7396 float ## s float ## s ## _max(float ## s a, float ## s b, \
7397 float_status *status) \
7399 return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR); \
7402 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7403 float_status *status) \
7405 return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR); \
7408 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7409 float_status *status) \
7411 return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR); \
7414 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7415 float_status *status) \
7417 return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR); \
7420 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7421 float_status *status) \
7423 return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR); \
7426 MINMAX(32)
7427 MINMAX(64)
7430 /* Multiply A by 2 raised to the power N. */
7431 float32 float32_scalbn(float32 a, int n, float_status *status)
7433 flag aSign;
7434 int16_t aExp;
7435 uint32_t aSig;
7437 a = float32_squash_input_denormal(a STATUS_VAR);
7438 aSig = extractFloat32Frac( a );
7439 aExp = extractFloat32Exp( a );
7440 aSign = extractFloat32Sign( a );
7442 if ( aExp == 0xFF ) {
7443 if ( aSig ) {
7444 return propagateFloat32NaN( a, a STATUS_VAR );
7446 return a;
7448 if (aExp != 0) {
7449 aSig |= 0x00800000;
7450 } else if (aSig == 0) {
7451 return a;
7452 } else {
7453 aExp++;
7456 if (n > 0x200) {
7457 n = 0x200;
7458 } else if (n < -0x200) {
7459 n = -0x200;
7462 aExp += n - 1;
7463 aSig <<= 7;
7464 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7467 float64 float64_scalbn(float64 a, int n, float_status *status)
7469 flag aSign;
7470 int16_t aExp;
7471 uint64_t aSig;
7473 a = float64_squash_input_denormal(a STATUS_VAR);
7474 aSig = extractFloat64Frac( a );
7475 aExp = extractFloat64Exp( a );
7476 aSign = extractFloat64Sign( a );
7478 if ( aExp == 0x7FF ) {
7479 if ( aSig ) {
7480 return propagateFloat64NaN( a, a STATUS_VAR );
7482 return a;
7484 if (aExp != 0) {
7485 aSig |= LIT64( 0x0010000000000000 );
7486 } else if (aSig == 0) {
7487 return a;
7488 } else {
7489 aExp++;
7492 if (n > 0x1000) {
7493 n = 0x1000;
7494 } else if (n < -0x1000) {
7495 n = -0x1000;
7498 aExp += n - 1;
7499 aSig <<= 10;
7500 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7503 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7505 flag aSign;
7506 int32_t aExp;
7507 uint64_t aSig;
7509 aSig = extractFloatx80Frac( a );
7510 aExp = extractFloatx80Exp( a );
7511 aSign = extractFloatx80Sign( a );
7513 if ( aExp == 0x7FFF ) {
7514 if ( aSig<<1 ) {
7515 return propagateFloatx80NaN( a, a STATUS_VAR );
7517 return a;
7520 if (aExp == 0) {
7521 if (aSig == 0) {
7522 return a;
7524 aExp++;
7527 if (n > 0x10000) {
7528 n = 0x10000;
7529 } else if (n < -0x10000) {
7530 n = -0x10000;
7533 aExp += n;
7534 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7535 aSign, aExp, aSig, 0 STATUS_VAR );
7538 float128 float128_scalbn(float128 a, int n, float_status *status)
7540 flag aSign;
7541 int32_t aExp;
7542 uint64_t aSig0, aSig1;
7544 aSig1 = extractFloat128Frac1( a );
7545 aSig0 = extractFloat128Frac0( a );
7546 aExp = extractFloat128Exp( a );
7547 aSign = extractFloat128Sign( a );
7548 if ( aExp == 0x7FFF ) {
7549 if ( aSig0 | aSig1 ) {
7550 return propagateFloat128NaN( a, a STATUS_VAR );
7552 return a;
7554 if (aExp != 0) {
7555 aSig0 |= LIT64( 0x0001000000000000 );
7556 } else if (aSig0 == 0 && aSig1 == 0) {
7557 return a;
7558 } else {
7559 aExp++;
7562 if (n > 0x10000) {
7563 n = 0x10000;
7564 } else if (n < -0x10000) {
7565 n = -0x10000;
7568 aExp += n - 1;
7569 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7570 STATUS_VAR );