4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "softfloat-macros.h"
99 /*----------------------------------------------------------------------------
100 | Functions and definitions to determine: (1) whether tininess for underflow
101 | is detected before or after rounding by default, (2) what (if anything)
102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 | are propagated from function inputs to output. These details are target-
106 *----------------------------------------------------------------------------*/
107 #include "softfloat-specialize.h"
109 /*----------------------------------------------------------------------------
110 | Returns the fraction bits of the half-precision floating-point value `a'.
111 *----------------------------------------------------------------------------*/
113 static inline uint32_t extractFloat16Frac(float16 a
)
115 return float16_val(a
) & 0x3ff;
118 /*----------------------------------------------------------------------------
119 | Returns the exponent bits of the half-precision floating-point value `a'.
120 *----------------------------------------------------------------------------*/
122 static inline int_fast16_t extractFloat16Exp(float16 a
)
124 return (float16_val(a
) >> 10) & 0x1f;
127 /*----------------------------------------------------------------------------
128 | Returns the sign bit of the single-precision floating-point value `a'.
129 *----------------------------------------------------------------------------*/
131 static inline flag
extractFloat16Sign(float16 a
)
133 return float16_val(a
)>>15;
136 /*----------------------------------------------------------------------------
137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
139 | input. If `zSign' is 1, the input is negated before being converted to an
140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141 | is simply rounded to an integer, with the inexact exception raised if the
142 | input cannot be represented exactly as an integer. However, if the fixed-
143 | point input is too large, the invalid exception is raised and the largest
144 | positive or negative integer is returned.
145 *----------------------------------------------------------------------------*/
147 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
150 flag roundNearestEven
;
151 int8 roundIncrement
, roundBits
;
154 roundingMode
= STATUS(float_rounding_mode
);
155 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
156 switch (roundingMode
) {
157 case float_round_nearest_even
:
158 case float_round_ties_away
:
159 roundIncrement
= 0x40;
161 case float_round_to_zero
:
165 roundIncrement
= zSign
? 0 : 0x7f;
167 case float_round_down
:
168 roundIncrement
= zSign
? 0x7f : 0;
173 roundBits
= absZ
& 0x7F;
174 absZ
= ( absZ
+ roundIncrement
)>>7;
175 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
177 if ( zSign
) z
= - z
;
178 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
179 float_raise( float_flag_invalid STATUS_VAR
);
180 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
187 /*----------------------------------------------------------------------------
188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
190 | and returns the properly rounded 64-bit integer corresponding to the input.
191 | If `zSign' is 1, the input is negated before being converted to an integer.
192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
193 | the inexact exception raised if the input cannot be represented exactly as
194 | an integer. However, if the fixed-point input is too large, the invalid
195 | exception is raised and the largest positive or negative integer is
197 *----------------------------------------------------------------------------*/
199 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
202 flag roundNearestEven
, increment
;
205 roundingMode
= STATUS(float_rounding_mode
);
206 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
207 switch (roundingMode
) {
208 case float_round_nearest_even
:
209 case float_round_ties_away
:
210 increment
= ((int64_t) absZ1
< 0);
212 case float_round_to_zero
:
216 increment
= !zSign
&& absZ1
;
218 case float_round_down
:
219 increment
= zSign
&& absZ1
;
226 if ( absZ0
== 0 ) goto overflow
;
227 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
230 if ( zSign
) z
= - z
;
231 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
233 float_raise( float_flag_invalid STATUS_VAR
);
235 zSign
? (int64_t) LIT64( 0x8000000000000000 )
236 : LIT64( 0x7FFFFFFFFFFFFFFF );
238 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
243 /*----------------------------------------------------------------------------
244 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
245 | `absZ1', with binary point between bits 63 and 64 (between the input words),
246 | and returns the properly rounded 64-bit unsigned integer corresponding to the
247 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
248 | with the inexact exception raised if the input cannot be represented exactly
249 | as an integer. However, if the fixed-point input is too large, the invalid
250 | exception is raised and the largest unsigned integer is returned.
251 *----------------------------------------------------------------------------*/
253 static int64
roundAndPackUint64(flag zSign
, uint64_t absZ0
,
254 uint64_t absZ1 STATUS_PARAM
)
257 flag roundNearestEven
, increment
;
259 roundingMode
= STATUS(float_rounding_mode
);
260 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
261 switch (roundingMode
) {
262 case float_round_nearest_even
:
263 case float_round_ties_away
:
264 increment
= ((int64_t)absZ1
< 0);
266 case float_round_to_zero
:
270 increment
= !zSign
&& absZ1
;
272 case float_round_down
:
273 increment
= zSign
&& absZ1
;
281 float_raise(float_flag_invalid STATUS_VAR
);
282 return LIT64(0xFFFFFFFFFFFFFFFF);
284 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
287 if (zSign
&& absZ0
) {
288 float_raise(float_flag_invalid STATUS_VAR
);
293 STATUS(float_exception_flags
) |= float_flag_inexact
;
298 /*----------------------------------------------------------------------------
299 | Returns the fraction bits of the single-precision floating-point value `a'.
300 *----------------------------------------------------------------------------*/
302 static inline uint32_t extractFloat32Frac( float32 a
)
305 return float32_val(a
) & 0x007FFFFF;
309 /*----------------------------------------------------------------------------
310 | Returns the exponent bits of the single-precision floating-point value `a'.
311 *----------------------------------------------------------------------------*/
313 static inline int_fast16_t extractFloat32Exp(float32 a
)
316 return ( float32_val(a
)>>23 ) & 0xFF;
320 /*----------------------------------------------------------------------------
321 | Returns the sign bit of the single-precision floating-point value `a'.
322 *----------------------------------------------------------------------------*/
324 static inline flag
extractFloat32Sign( float32 a
)
327 return float32_val(a
)>>31;
331 /*----------------------------------------------------------------------------
332 | If `a' is denormal and we are in flush-to-zero mode then set the
333 | input-denormal exception and return zero. Otherwise just return the value.
334 *----------------------------------------------------------------------------*/
335 float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
337 if (STATUS(flush_inputs_to_zero
)) {
338 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
339 float_raise(float_flag_input_denormal STATUS_VAR
);
340 return make_float32(float32_val(a
) & 0x80000000);
346 /*----------------------------------------------------------------------------
347 | Normalizes the subnormal single-precision floating-point value represented
348 | by the denormalized significand `aSig'. The normalized exponent and
349 | significand are stored at the locations pointed to by `zExpPtr' and
350 | `zSigPtr', respectively.
351 *----------------------------------------------------------------------------*/
354 normalizeFloat32Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
, uint32_t *zSigPtr
)
358 shiftCount
= countLeadingZeros32( aSig
) - 8;
359 *zSigPtr
= aSig
<<shiftCount
;
360 *zExpPtr
= 1 - shiftCount
;
364 /*----------------------------------------------------------------------------
365 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
366 | single-precision floating-point value, returning the result. After being
367 | shifted into the proper positions, the three fields are simply added
368 | together to form the result. This means that any integer portion of `zSig'
369 | will be added into the exponent. Since a properly normalized significand
370 | will have an integer portion equal to 1, the `zExp' input should be 1 less
371 | than the desired result exponent whenever `zSig' is a complete, normalized
373 *----------------------------------------------------------------------------*/
375 static inline float32
packFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
)
379 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
383 /*----------------------------------------------------------------------------
384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385 | and significand `zSig', and returns the proper single-precision floating-
386 | point value corresponding to the abstract input. Ordinarily, the abstract
387 | value is simply rounded and packed into the single-precision format, with
388 | the inexact exception raised if the abstract input cannot be represented
389 | exactly. However, if the abstract value is too large, the overflow and
390 | inexact exceptions are raised and an infinity or maximal finite value is
391 | returned. If the abstract value is too small, the input value is rounded to
392 | a subnormal number, and the underflow and inexact exceptions are raised if
393 | the abstract input cannot be represented exactly as a subnormal single-
394 | precision floating-point number.
395 | The input significand `zSig' has its binary point between bits 30
396 | and 29, which is 7 bits to the left of the usual location. This shifted
397 | significand must be normalized or smaller. If `zSig' is not normalized,
398 | `zExp' must be 0; in that case, the result returned is a subnormal number,
399 | and it must not require rounding. In the usual case that `zSig' is
400 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
401 | The handling of underflow and overflow follows the IEC/IEEE Standard for
402 | Binary Floating-Point Arithmetic.
403 *----------------------------------------------------------------------------*/
405 static float32
roundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
408 flag roundNearestEven
;
409 int8 roundIncrement
, roundBits
;
412 roundingMode
= STATUS(float_rounding_mode
);
413 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
414 switch (roundingMode
) {
415 case float_round_nearest_even
:
416 case float_round_ties_away
:
417 roundIncrement
= 0x40;
419 case float_round_to_zero
:
423 roundIncrement
= zSign
? 0 : 0x7f;
425 case float_round_down
:
426 roundIncrement
= zSign
? 0x7f : 0;
432 roundBits
= zSig
& 0x7F;
433 if ( 0xFD <= (uint16_t) zExp
) {
435 || ( ( zExp
== 0xFD )
436 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
438 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
439 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
442 if (STATUS(flush_to_zero
)) {
443 float_raise(float_flag_output_denormal STATUS_VAR
);
444 return packFloat32(zSign
, 0, 0);
447 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
449 || ( zSig
+ roundIncrement
< 0x80000000 );
450 shift32RightJamming( zSig
, - zExp
, &zSig
);
452 roundBits
= zSig
& 0x7F;
453 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
456 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
457 zSig
= ( zSig
+ roundIncrement
)>>7;
458 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
459 if ( zSig
== 0 ) zExp
= 0;
460 return packFloat32( zSign
, zExp
, zSig
);
464 /*----------------------------------------------------------------------------
465 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
466 | and significand `zSig', and returns the proper single-precision floating-
467 | point value corresponding to the abstract input. This routine is just like
468 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
469 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
470 | floating-point exponent.
471 *----------------------------------------------------------------------------*/
474 normalizeRoundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
478 shiftCount
= countLeadingZeros32( zSig
) - 1;
479 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
483 /*----------------------------------------------------------------------------
484 | Returns the fraction bits of the double-precision floating-point value `a'.
485 *----------------------------------------------------------------------------*/
487 static inline uint64_t extractFloat64Frac( float64 a
)
490 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
494 /*----------------------------------------------------------------------------
495 | Returns the exponent bits of the double-precision floating-point value `a'.
496 *----------------------------------------------------------------------------*/
498 static inline int_fast16_t extractFloat64Exp(float64 a
)
501 return ( float64_val(a
)>>52 ) & 0x7FF;
505 /*----------------------------------------------------------------------------
506 | Returns the sign bit of the double-precision floating-point value `a'.
507 *----------------------------------------------------------------------------*/
509 static inline flag
extractFloat64Sign( float64 a
)
512 return float64_val(a
)>>63;
516 /*----------------------------------------------------------------------------
517 | If `a' is denormal and we are in flush-to-zero mode then set the
518 | input-denormal exception and return zero. Otherwise just return the value.
519 *----------------------------------------------------------------------------*/
520 float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
522 if (STATUS(flush_inputs_to_zero
)) {
523 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
524 float_raise(float_flag_input_denormal STATUS_VAR
);
525 return make_float64(float64_val(a
) & (1ULL << 63));
531 /*----------------------------------------------------------------------------
532 | Normalizes the subnormal double-precision floating-point value represented
533 | by the denormalized significand `aSig'. The normalized exponent and
534 | significand are stored at the locations pointed to by `zExpPtr' and
535 | `zSigPtr', respectively.
536 *----------------------------------------------------------------------------*/
539 normalizeFloat64Subnormal(uint64_t aSig
, int_fast16_t *zExpPtr
, uint64_t *zSigPtr
)
543 shiftCount
= countLeadingZeros64( aSig
) - 11;
544 *zSigPtr
= aSig
<<shiftCount
;
545 *zExpPtr
= 1 - shiftCount
;
549 /*----------------------------------------------------------------------------
550 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
551 | double-precision floating-point value, returning the result. After being
552 | shifted into the proper positions, the three fields are simply added
553 | together to form the result. This means that any integer portion of `zSig'
554 | will be added into the exponent. Since a properly normalized significand
555 | will have an integer portion equal to 1, the `zExp' input should be 1 less
556 | than the desired result exponent whenever `zSig' is a complete, normalized
558 *----------------------------------------------------------------------------*/
560 static inline float64
packFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
)
564 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
568 /*----------------------------------------------------------------------------
569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570 | and significand `zSig', and returns the proper double-precision floating-
571 | point value corresponding to the abstract input. Ordinarily, the abstract
572 | value is simply rounded and packed into the double-precision format, with
573 | the inexact exception raised if the abstract input cannot be represented
574 | exactly. However, if the abstract value is too large, the overflow and
575 | inexact exceptions are raised and an infinity or maximal finite value is
576 | returned. If the abstract value is too small, the input value is rounded to
577 | a subnormal number, and the underflow and inexact exceptions are raised if
578 | the abstract input cannot be represented exactly as a subnormal double-
579 | precision floating-point number.
580 | The input significand `zSig' has its binary point between bits 62
581 | and 61, which is 10 bits to the left of the usual location. This shifted
582 | significand must be normalized or smaller. If `zSig' is not normalized,
583 | `zExp' must be 0; in that case, the result returned is a subnormal number,
584 | and it must not require rounding. In the usual case that `zSig' is
585 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
586 | The handling of underflow and overflow follows the IEC/IEEE Standard for
587 | Binary Floating-Point Arithmetic.
588 *----------------------------------------------------------------------------*/
590 static float64
roundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
593 flag roundNearestEven
;
594 int_fast16_t roundIncrement
, roundBits
;
597 roundingMode
= STATUS(float_rounding_mode
);
598 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
599 switch (roundingMode
) {
600 case float_round_nearest_even
:
601 case float_round_ties_away
:
602 roundIncrement
= 0x200;
604 case float_round_to_zero
:
608 roundIncrement
= zSign
? 0 : 0x3ff;
610 case float_round_down
:
611 roundIncrement
= zSign
? 0x3ff : 0;
616 roundBits
= zSig
& 0x3FF;
617 if ( 0x7FD <= (uint16_t) zExp
) {
618 if ( ( 0x7FD < zExp
)
619 || ( ( zExp
== 0x7FD )
620 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
622 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
623 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
626 if (STATUS(flush_to_zero
)) {
627 float_raise(float_flag_output_denormal STATUS_VAR
);
628 return packFloat64(zSign
, 0, 0);
631 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
633 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
634 shift64RightJamming( zSig
, - zExp
, &zSig
);
636 roundBits
= zSig
& 0x3FF;
637 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
640 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
641 zSig
= ( zSig
+ roundIncrement
)>>10;
642 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
643 if ( zSig
== 0 ) zExp
= 0;
644 return packFloat64( zSign
, zExp
, zSig
);
648 /*----------------------------------------------------------------------------
649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
650 | and significand `zSig', and returns the proper double-precision floating-
651 | point value corresponding to the abstract input. This routine is just like
652 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
653 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
654 | floating-point exponent.
655 *----------------------------------------------------------------------------*/
658 normalizeRoundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
662 shiftCount
= countLeadingZeros64( zSig
) - 1;
663 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
667 /*----------------------------------------------------------------------------
668 | Returns the fraction bits of the extended double-precision floating-point
670 *----------------------------------------------------------------------------*/
672 static inline uint64_t extractFloatx80Frac( floatx80 a
)
679 /*----------------------------------------------------------------------------
680 | Returns the exponent bits of the extended double-precision floating-point
682 *----------------------------------------------------------------------------*/
684 static inline int32
extractFloatx80Exp( floatx80 a
)
687 return a
.high
& 0x7FFF;
691 /*----------------------------------------------------------------------------
692 | Returns the sign bit of the extended double-precision floating-point value
694 *----------------------------------------------------------------------------*/
696 static inline flag
extractFloatx80Sign( floatx80 a
)
703 /*----------------------------------------------------------------------------
704 | Normalizes the subnormal extended double-precision floating-point value
705 | represented by the denormalized significand `aSig'. The normalized exponent
706 | and significand are stored at the locations pointed to by `zExpPtr' and
707 | `zSigPtr', respectively.
708 *----------------------------------------------------------------------------*/
711 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
715 shiftCount
= countLeadingZeros64( aSig
);
716 *zSigPtr
= aSig
<<shiftCount
;
717 *zExpPtr
= 1 - shiftCount
;
721 /*----------------------------------------------------------------------------
722 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
723 | extended double-precision floating-point value, returning the result.
724 *----------------------------------------------------------------------------*/
726 static inline floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
731 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
736 /*----------------------------------------------------------------------------
737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
738 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
739 | and returns the proper extended double-precision floating-point value
740 | corresponding to the abstract input. Ordinarily, the abstract value is
741 | rounded and packed into the extended double-precision format, with the
742 | inexact exception raised if the abstract input cannot be represented
743 | exactly. However, if the abstract value is too large, the overflow and
744 | inexact exceptions are raised and an infinity or maximal finite value is
745 | returned. If the abstract value is too small, the input value is rounded to
746 | a subnormal number, and the underflow and inexact exceptions are raised if
747 | the abstract input cannot be represented exactly as a subnormal extended
748 | double-precision floating-point number.
749 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
750 | number of bits as single or double precision, respectively. Otherwise, the
751 | result is rounded to the full precision of the extended double-precision
753 | The input significand must be normalized or smaller. If the input
754 | significand is not normalized, `zExp' must be 0; in that case, the result
755 | returned is a subnormal number, and it must not require rounding. The
756 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
757 | Floating-Point Arithmetic.
758 *----------------------------------------------------------------------------*/
761 roundAndPackFloatx80(
762 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
766 flag roundNearestEven
, increment
, isTiny
;
767 int64 roundIncrement
, roundMask
, roundBits
;
769 roundingMode
= STATUS(float_rounding_mode
);
770 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
771 if ( roundingPrecision
== 80 ) goto precision80
;
772 if ( roundingPrecision
== 64 ) {
773 roundIncrement
= LIT64( 0x0000000000000400 );
774 roundMask
= LIT64( 0x00000000000007FF );
776 else if ( roundingPrecision
== 32 ) {
777 roundIncrement
= LIT64( 0x0000008000000000 );
778 roundMask
= LIT64( 0x000000FFFFFFFFFF );
783 zSig0
|= ( zSig1
!= 0 );
784 switch (roundingMode
) {
785 case float_round_nearest_even
:
786 case float_round_ties_away
:
788 case float_round_to_zero
:
792 roundIncrement
= zSign
? 0 : roundMask
;
794 case float_round_down
:
795 roundIncrement
= zSign
? roundMask
: 0;
800 roundBits
= zSig0
& roundMask
;
801 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
802 if ( ( 0x7FFE < zExp
)
803 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
808 if (STATUS(flush_to_zero
)) {
809 float_raise(float_flag_output_denormal STATUS_VAR
);
810 return packFloatx80(zSign
, 0, 0);
813 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
815 || ( zSig0
<= zSig0
+ roundIncrement
);
816 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
818 roundBits
= zSig0
& roundMask
;
819 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
820 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
821 zSig0
+= roundIncrement
;
822 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
823 roundIncrement
= roundMask
+ 1;
824 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
825 roundMask
|= roundIncrement
;
827 zSig0
&= ~ roundMask
;
828 return packFloatx80( zSign
, zExp
, zSig0
);
831 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
832 zSig0
+= roundIncrement
;
833 if ( zSig0
< roundIncrement
) {
835 zSig0
= LIT64( 0x8000000000000000 );
837 roundIncrement
= roundMask
+ 1;
838 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
839 roundMask
|= roundIncrement
;
841 zSig0
&= ~ roundMask
;
842 if ( zSig0
== 0 ) zExp
= 0;
843 return packFloatx80( zSign
, zExp
, zSig0
);
845 switch (roundingMode
) {
846 case float_round_nearest_even
:
847 case float_round_ties_away
:
848 increment
= ((int64_t)zSig1
< 0);
850 case float_round_to_zero
:
854 increment
= !zSign
&& zSig1
;
856 case float_round_down
:
857 increment
= zSign
&& zSig1
;
862 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
863 if ( ( 0x7FFE < zExp
)
864 || ( ( zExp
== 0x7FFE )
865 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
871 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
872 if ( ( roundingMode
== float_round_to_zero
)
873 || ( zSign
&& ( roundingMode
== float_round_up
) )
874 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
876 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
878 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
882 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
885 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
886 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
888 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
889 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
890 switch (roundingMode
) {
891 case float_round_nearest_even
:
892 case float_round_ties_away
:
893 increment
= ((int64_t)zSig1
< 0);
895 case float_round_to_zero
:
899 increment
= !zSign
&& zSig1
;
901 case float_round_down
:
902 increment
= zSign
&& zSig1
;
910 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
911 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
913 return packFloatx80( zSign
, zExp
, zSig0
);
916 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
921 zSig0
= LIT64( 0x8000000000000000 );
924 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
928 if ( zSig0
== 0 ) zExp
= 0;
930 return packFloatx80( zSign
, zExp
, zSig0
);
934 /*----------------------------------------------------------------------------
935 | Takes an abstract floating-point value having sign `zSign', exponent
936 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
937 | and returns the proper extended double-precision floating-point value
938 | corresponding to the abstract input. This routine is just like
939 | `roundAndPackFloatx80' except that the input significand does not have to be
941 *----------------------------------------------------------------------------*/
944 normalizeRoundAndPackFloatx80(
945 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
955 shiftCount
= countLeadingZeros64( zSig0
);
956 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
959 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
963 /*----------------------------------------------------------------------------
964 | Returns the least-significant 64 fraction bits of the quadruple-precision
965 | floating-point value `a'.
966 *----------------------------------------------------------------------------*/
968 static inline uint64_t extractFloat128Frac1( float128 a
)
975 /*----------------------------------------------------------------------------
976 | Returns the most-significant 48 fraction bits of the quadruple-precision
977 | floating-point value `a'.
978 *----------------------------------------------------------------------------*/
980 static inline uint64_t extractFloat128Frac0( float128 a
)
983 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
987 /*----------------------------------------------------------------------------
988 | Returns the exponent bits of the quadruple-precision floating-point value
990 *----------------------------------------------------------------------------*/
992 static inline int32
extractFloat128Exp( float128 a
)
995 return ( a
.high
>>48 ) & 0x7FFF;
999 /*----------------------------------------------------------------------------
1000 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1001 *----------------------------------------------------------------------------*/
1003 static inline flag
extractFloat128Sign( float128 a
)
1010 /*----------------------------------------------------------------------------
1011 | Normalizes the subnormal quadruple-precision floating-point value
1012 | represented by the denormalized significand formed by the concatenation of
1013 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1014 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1015 | significand are stored at the location pointed to by `zSig0Ptr', and the
1016 | least significant 64 bits of the normalized significand are stored at the
1017 | location pointed to by `zSig1Ptr'.
1018 *----------------------------------------------------------------------------*/
1021 normalizeFloat128Subnormal(
1032 shiftCount
= countLeadingZeros64( aSig1
) - 15;
1033 if ( shiftCount
< 0 ) {
1034 *zSig0Ptr
= aSig1
>>( - shiftCount
);
1035 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
1038 *zSig0Ptr
= aSig1
<<shiftCount
;
1041 *zExpPtr
= - shiftCount
- 63;
1044 shiftCount
= countLeadingZeros64( aSig0
) - 15;
1045 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
1046 *zExpPtr
= 1 - shiftCount
;
1051 /*----------------------------------------------------------------------------
1052 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1053 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1054 | floating-point value, returning the result. After being shifted into the
1055 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1056 | added together to form the most significant 32 bits of the result. This
1057 | means that any integer portion of `zSig0' will be added into the exponent.
1058 | Since a properly normalized significand will have an integer portion equal
1059 | to 1, the `zExp' input should be 1 less than the desired result exponent
1060 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1062 *----------------------------------------------------------------------------*/
1064 static inline float128
1065 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
1070 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1075 /*----------------------------------------------------------------------------
1076 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1078 | and `zSig2', and returns the proper quadruple-precision floating-point value
1079 | corresponding to the abstract input. Ordinarily, the abstract value is
1080 | simply rounded and packed into the quadruple-precision format, with the
1081 | inexact exception raised if the abstract input cannot be represented
1082 | exactly. However, if the abstract value is too large, the overflow and
1083 | inexact exceptions are raised and an infinity or maximal finite value is
1084 | returned. If the abstract value is too small, the input value is rounded to
1085 | a subnormal number, and the underflow and inexact exceptions are raised if
1086 | the abstract input cannot be represented exactly as a subnormal quadruple-
1087 | precision floating-point number.
1088 | The input significand must be normalized or smaller. If the input
1089 | significand is not normalized, `zExp' must be 0; in that case, the result
1090 | returned is a subnormal number, and it must not require rounding. In the
1091 | usual case that the input significand is normalized, `zExp' must be 1 less
1092 | than the ``true'' floating-point exponent. The handling of underflow and
1093 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1094 *----------------------------------------------------------------------------*/
1097 roundAndPackFloat128(
1098 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
1101 flag roundNearestEven
, increment
, isTiny
;
1103 roundingMode
= STATUS(float_rounding_mode
);
1104 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1105 switch (roundingMode
) {
1106 case float_round_nearest_even
:
1107 case float_round_ties_away
:
1108 increment
= ((int64_t)zSig2
< 0);
1110 case float_round_to_zero
:
1113 case float_round_up
:
1114 increment
= !zSign
&& zSig2
;
1116 case float_round_down
:
1117 increment
= zSign
&& zSig2
;
1122 if ( 0x7FFD <= (uint32_t) zExp
) {
1123 if ( ( 0x7FFD < zExp
)
1124 || ( ( zExp
== 0x7FFD )
1126 LIT64( 0x0001FFFFFFFFFFFF ),
1127 LIT64( 0xFFFFFFFFFFFFFFFF ),
1134 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1135 if ( ( roundingMode
== float_round_to_zero
)
1136 || ( zSign
&& ( roundingMode
== float_round_up
) )
1137 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1143 LIT64( 0x0000FFFFFFFFFFFF ),
1144 LIT64( 0xFFFFFFFFFFFFFFFF )
1147 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1150 if (STATUS(flush_to_zero
)) {
1151 float_raise(float_flag_output_denormal STATUS_VAR
);
1152 return packFloat128(zSign
, 0, 0, 0);
1155 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1161 LIT64( 0x0001FFFFFFFFFFFF ),
1162 LIT64( 0xFFFFFFFFFFFFFFFF )
1164 shift128ExtraRightJamming(
1165 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1167 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1168 switch (roundingMode
) {
1169 case float_round_nearest_even
:
1170 case float_round_ties_away
:
1171 increment
= ((int64_t)zSig2
< 0);
1173 case float_round_to_zero
:
1176 case float_round_up
:
1177 increment
= !zSign
&& zSig2
;
1179 case float_round_down
:
1180 increment
= zSign
&& zSig2
;
1187 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1189 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1190 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1193 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1195 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1199 /*----------------------------------------------------------------------------
1200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1201 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1202 | returns the proper quadruple-precision floating-point value corresponding
1203 | to the abstract input. This routine is just like `roundAndPackFloat128'
1204 | except that the input significand has fewer bits and does not have to be
1205 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1207 *----------------------------------------------------------------------------*/
1210 normalizeRoundAndPackFloat128(
1211 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1221 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1222 if ( 0 <= shiftCount
) {
1224 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1227 shift128ExtraRightJamming(
1228 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1231 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1235 /*----------------------------------------------------------------------------
1236 | Returns the result of converting the 32-bit two's complement integer `a'
1237 | to the single-precision floating-point format. The conversion is performed
1238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1239 *----------------------------------------------------------------------------*/
1241 float32
int32_to_float32(int32_t a STATUS_PARAM
)
1245 if ( a
== 0 ) return float32_zero
;
1246 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1248 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 32-bit two's complement integer `a'
1254 | to the double-precision floating-point format. The conversion is performed
1255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 *----------------------------------------------------------------------------*/
1258 float64
int32_to_float64(int32_t a STATUS_PARAM
)
1265 if ( a
== 0 ) return float64_zero
;
1267 absA
= zSign
? - a
: a
;
1268 shiftCount
= countLeadingZeros32( absA
) + 21;
1270 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the extended double-precision floating-point format. The conversion
1277 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1279 *----------------------------------------------------------------------------*/
1281 floatx80
int32_to_floatx80(int32_t a STATUS_PARAM
)
1288 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1290 absA
= zSign
? - a
: a
;
1291 shiftCount
= countLeadingZeros32( absA
) + 32;
1293 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 32-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format. The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1303 float128
int32_to_float128(int32_t a STATUS_PARAM
)
1310 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1312 absA
= zSign
? - a
: a
;
1313 shiftCount
= countLeadingZeros32( absA
) + 17;
1315 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1319 /*----------------------------------------------------------------------------
1320 | Returns the result of converting the 64-bit two's complement integer `a'
1321 | to the single-precision floating-point format. The conversion is performed
1322 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1323 *----------------------------------------------------------------------------*/
1325 float32
int64_to_float32(int64_t a STATUS_PARAM
)
1331 if ( a
== 0 ) return float32_zero
;
1333 absA
= zSign
? - a
: a
;
1334 shiftCount
= countLeadingZeros64( absA
) - 40;
1335 if ( 0 <= shiftCount
) {
1336 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1340 if ( shiftCount
< 0 ) {
1341 shift64RightJamming( absA
, - shiftCount
, &absA
);
1344 absA
<<= shiftCount
;
1346 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the 64-bit two's complement integer `a'
1353 | to the double-precision floating-point format. The conversion is performed
1354 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1355 *----------------------------------------------------------------------------*/
1357 float64
int64_to_float64(int64_t a STATUS_PARAM
)
1361 if ( a
== 0 ) return float64_zero
;
1362 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1363 return packFloat64( 1, 0x43E, 0 );
1366 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the 64-bit two's complement integer `a'
1372 | to the extended double-precision floating-point format. The conversion
1373 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1375 *----------------------------------------------------------------------------*/
1377 floatx80
int64_to_floatx80(int64_t a STATUS_PARAM
)
1383 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1385 absA
= zSign
? - a
: a
;
1386 shiftCount
= countLeadingZeros64( absA
);
1387 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1391 /*----------------------------------------------------------------------------
1392 | Returns the result of converting the 64-bit two's complement integer `a' to
1393 | the quadruple-precision floating-point format. The conversion is performed
1394 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1395 *----------------------------------------------------------------------------*/
1397 float128
int64_to_float128(int64_t a STATUS_PARAM
)
1403 uint64_t zSig0
, zSig1
;
1405 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1407 absA
= zSign
? - a
: a
;
1408 shiftCount
= countLeadingZeros64( absA
) + 49;
1409 zExp
= 0x406E - shiftCount
;
1410 if ( 64 <= shiftCount
) {
1419 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1420 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1424 /*----------------------------------------------------------------------------
1425 | Returns the result of converting the 64-bit unsigned integer `a'
1426 | to the single-precision floating-point format. The conversion is performed
1427 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1428 *----------------------------------------------------------------------------*/
1430 float32
uint64_to_float32(uint64_t a STATUS_PARAM
)
1435 return float32_zero
;
1438 /* Determine (left) shift needed to put first set bit into bit posn 23
1439 * (since packFloat32() expects the binary point between bits 23 and 22);
1440 * this is the fast case for smallish numbers.
1442 shiftcount
= countLeadingZeros64(a
) - 40;
1443 if (shiftcount
>= 0) {
1444 return packFloat32(0, 0x95 - shiftcount
, a
<< shiftcount
);
1446 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1447 * expects the binary point between bits 30 and 29, hence the + 7.
1450 if (shiftcount
< 0) {
1451 shift64RightJamming(a
, -shiftcount
, &a
);
1456 return roundAndPackFloat32(0, 0x9c - shiftcount
, a STATUS_VAR
);
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the 64-bit unsigned integer `a'
1461 | to the double-precision floating-point format. The conversion is performed
1462 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1463 *----------------------------------------------------------------------------*/
1465 float64
uint64_to_float64(uint64_t a STATUS_PARAM
)
1471 return float64_zero
;
1474 shiftcount
= countLeadingZeros64(a
) - 1;
1475 if (shiftcount
< 0) {
1476 shift64RightJamming(a
, -shiftcount
, &a
);
1480 return roundAndPackFloat64(0, exp
- shiftcount
, a STATUS_VAR
);
1483 /*----------------------------------------------------------------------------
1484 | Returns the result of converting the 64-bit unsigned integer `a'
1485 | to the quadruple-precision floating-point format. The conversion is performed
1486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1487 *----------------------------------------------------------------------------*/
1489 float128
uint64_to_float128(uint64_t a STATUS_PARAM
)
1492 return float128_zero
;
1494 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0 STATUS_VAR
);
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 32-bit two's complement integer format. The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic---which means in particular that the conversion is rounded
1502 | according to the current rounding mode. If `a' is a NaN, the largest
1503 | positive integer is returned. Otherwise, if the conversion overflows, the
1504 | largest integer with the same sign as `a' is returned.
1505 *----------------------------------------------------------------------------*/
1507 int32
float32_to_int32( float32 a STATUS_PARAM
)
1510 int_fast16_t aExp
, shiftCount
;
1514 a
= float32_squash_input_denormal(a STATUS_VAR
);
1515 aSig
= extractFloat32Frac( a
);
1516 aExp
= extractFloat32Exp( a
);
1517 aSign
= extractFloat32Sign( a
);
1518 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1519 if ( aExp
) aSig
|= 0x00800000;
1520 shiftCount
= 0xAF - aExp
;
1523 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1524 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1528 /*----------------------------------------------------------------------------
1529 | Returns the result of converting the single-precision floating-point value
1530 | `a' to the 32-bit two's complement integer format. The conversion is
1531 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1532 | Arithmetic, except that the conversion is always rounded toward zero.
1533 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1534 | the conversion overflows, the largest integer with the same sign as `a' is
1536 *----------------------------------------------------------------------------*/
1538 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1541 int_fast16_t aExp
, shiftCount
;
1544 a
= float32_squash_input_denormal(a STATUS_VAR
);
1546 aSig
= extractFloat32Frac( a
);
1547 aExp
= extractFloat32Exp( a
);
1548 aSign
= extractFloat32Sign( a
);
1549 shiftCount
= aExp
- 0x9E;
1550 if ( 0 <= shiftCount
) {
1551 if ( float32_val(a
) != 0xCF000000 ) {
1552 float_raise( float_flag_invalid STATUS_VAR
);
1553 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1555 return (int32_t) 0x80000000;
1557 else if ( aExp
<= 0x7E ) {
1558 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1561 aSig
= ( aSig
| 0x00800000 )<<8;
1562 z
= aSig
>>( - shiftCount
);
1563 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1564 STATUS(float_exception_flags
) |= float_flag_inexact
;
1566 if ( aSign
) z
= - z
;
1571 /*----------------------------------------------------------------------------
1572 | Returns the result of converting the single-precision floating-point value
1573 | `a' to the 16-bit two's complement integer format. The conversion is
1574 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1575 | Arithmetic, except that the conversion is always rounded toward zero.
1576 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1577 | the conversion overflows, the largest integer with the same sign as `a' is
1579 *----------------------------------------------------------------------------*/
1581 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM
)
1584 int_fast16_t aExp
, shiftCount
;
1588 aSig
= extractFloat32Frac( a
);
1589 aExp
= extractFloat32Exp( a
);
1590 aSign
= extractFloat32Sign( a
);
1591 shiftCount
= aExp
- 0x8E;
1592 if ( 0 <= shiftCount
) {
1593 if ( float32_val(a
) != 0xC7000000 ) {
1594 float_raise( float_flag_invalid STATUS_VAR
);
1595 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1599 return (int32_t) 0xffff8000;
1601 else if ( aExp
<= 0x7E ) {
1602 if ( aExp
| aSig
) {
1603 STATUS(float_exception_flags
) |= float_flag_inexact
;
1608 aSig
= ( aSig
| 0x00800000 )<<8;
1609 z
= aSig
>>( - shiftCount
);
1610 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1611 STATUS(float_exception_flags
) |= float_flag_inexact
;
1620 /*----------------------------------------------------------------------------
1621 | Returns the result of converting the single-precision floating-point value
1622 | `a' to the 64-bit two's complement integer format. The conversion is
1623 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1624 | Arithmetic---which means in particular that the conversion is rounded
1625 | according to the current rounding mode. If `a' is a NaN, the largest
1626 | positive integer is returned. Otherwise, if the conversion overflows, the
1627 | largest integer with the same sign as `a' is returned.
1628 *----------------------------------------------------------------------------*/
1630 int64
float32_to_int64( float32 a STATUS_PARAM
)
1633 int_fast16_t aExp
, shiftCount
;
1635 uint64_t aSig64
, aSigExtra
;
1636 a
= float32_squash_input_denormal(a STATUS_VAR
);
1638 aSig
= extractFloat32Frac( a
);
1639 aExp
= extractFloat32Exp( a
);
1640 aSign
= extractFloat32Sign( a
);
1641 shiftCount
= 0xBE - aExp
;
1642 if ( shiftCount
< 0 ) {
1643 float_raise( float_flag_invalid STATUS_VAR
);
1644 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1645 return LIT64( 0x7FFFFFFFFFFFFFFF );
1647 return (int64_t) LIT64( 0x8000000000000000 );
1649 if ( aExp
) aSig
|= 0x00800000;
1652 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1653 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of converting the single-precision floating-point value
1659 | `a' to the 64-bit unsigned integer format. The conversion is
1660 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1661 | Arithmetic---which means in particular that the conversion is rounded
1662 | according to the current rounding mode. If `a' is a NaN, the largest
1663 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1664 | largest unsigned integer is returned. If the 'a' is negative, the result
1665 | is rounded and zero is returned; values that do not round to zero will
1666 | raise the inexact exception flag.
1667 *----------------------------------------------------------------------------*/
1669 uint64
float32_to_uint64(float32 a STATUS_PARAM
)
1672 int_fast16_t aExp
, shiftCount
;
1674 uint64_t aSig64
, aSigExtra
;
1675 a
= float32_squash_input_denormal(a STATUS_VAR
);
1677 aSig
= extractFloat32Frac(a
);
1678 aExp
= extractFloat32Exp(a
);
1679 aSign
= extractFloat32Sign(a
);
1680 if ((aSign
) && (aExp
> 126)) {
1681 float_raise(float_flag_invalid STATUS_VAR
);
1682 if (float32_is_any_nan(a
)) {
1683 return LIT64(0xFFFFFFFFFFFFFFFF);
1688 shiftCount
= 0xBE - aExp
;
1692 if (shiftCount
< 0) {
1693 float_raise(float_flag_invalid STATUS_VAR
);
1694 return LIT64(0xFFFFFFFFFFFFFFFF);
1699 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1700 return roundAndPackUint64(aSign
, aSig64
, aSigExtra STATUS_VAR
);
1703 /*----------------------------------------------------------------------------
1704 | Returns the result of converting the single-precision floating-point value
1705 | `a' to the 64-bit unsigned integer format. The conversion is
1706 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1707 | Arithmetic, except that the conversion is always rounded toward zero. If
1708 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1709 | conversion overflows, the largest unsigned integer is returned. If the
1710 | 'a' is negative, the result is rounded and zero is returned; values that do
1711 | not round to zero will raise the inexact flag.
1712 *----------------------------------------------------------------------------*/
1714 uint64
float32_to_uint64_round_to_zero(float32 a STATUS_PARAM
)
1716 signed char current_rounding_mode
= STATUS(float_rounding_mode
);
1717 set_float_rounding_mode(float_round_to_zero STATUS_VAR
);
1718 int64_t v
= float32_to_uint64(a STATUS_VAR
);
1719 set_float_rounding_mode(current_rounding_mode STATUS_VAR
);
1723 /*----------------------------------------------------------------------------
1724 | Returns the result of converting the single-precision floating-point value
1725 | `a' to the 64-bit two's complement integer format. The conversion is
1726 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1727 | Arithmetic, except that the conversion is always rounded toward zero. If
1728 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1729 | conversion overflows, the largest integer with the same sign as `a' is
1731 *----------------------------------------------------------------------------*/
1733 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1736 int_fast16_t aExp
, shiftCount
;
1740 a
= float32_squash_input_denormal(a STATUS_VAR
);
1742 aSig
= extractFloat32Frac( a
);
1743 aExp
= extractFloat32Exp( a
);
1744 aSign
= extractFloat32Sign( a
);
1745 shiftCount
= aExp
- 0xBE;
1746 if ( 0 <= shiftCount
) {
1747 if ( float32_val(a
) != 0xDF000000 ) {
1748 float_raise( float_flag_invalid STATUS_VAR
);
1749 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1750 return LIT64( 0x7FFFFFFFFFFFFFFF );
1753 return (int64_t) LIT64( 0x8000000000000000 );
1755 else if ( aExp
<= 0x7E ) {
1756 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1759 aSig64
= aSig
| 0x00800000;
1761 z
= aSig64
>>( - shiftCount
);
1762 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1763 STATUS(float_exception_flags
) |= float_flag_inexact
;
1765 if ( aSign
) z
= - z
;
1770 /*----------------------------------------------------------------------------
1771 | Returns the result of converting the single-precision floating-point value
1772 | `a' to the double-precision floating-point format. The conversion is
1773 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1775 *----------------------------------------------------------------------------*/
1777 float64
float32_to_float64( float32 a STATUS_PARAM
)
1782 a
= float32_squash_input_denormal(a STATUS_VAR
);
1784 aSig
= extractFloat32Frac( a
);
1785 aExp
= extractFloat32Exp( a
);
1786 aSign
= extractFloat32Sign( a
);
1787 if ( aExp
== 0xFF ) {
1788 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1789 return packFloat64( aSign
, 0x7FF, 0 );
1792 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1793 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1796 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of converting the single-precision floating-point value
1802 | `a' to the extended double-precision floating-point format. The conversion
1803 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1805 *----------------------------------------------------------------------------*/
1807 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1813 a
= float32_squash_input_denormal(a STATUS_VAR
);
1814 aSig
= extractFloat32Frac( a
);
1815 aExp
= extractFloat32Exp( a
);
1816 aSign
= extractFloat32Sign( a
);
1817 if ( aExp
== 0xFF ) {
1818 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1819 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1822 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1823 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1826 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1830 /*----------------------------------------------------------------------------
1831 | Returns the result of converting the single-precision floating-point value
1832 | `a' to the double-precision floating-point format. The conversion is
1833 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1835 *----------------------------------------------------------------------------*/
1837 float128
float32_to_float128( float32 a STATUS_PARAM
)
1843 a
= float32_squash_input_denormal(a STATUS_VAR
);
1844 aSig
= extractFloat32Frac( a
);
1845 aExp
= extractFloat32Exp( a
);
1846 aSign
= extractFloat32Sign( a
);
1847 if ( aExp
== 0xFF ) {
1848 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1849 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1852 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1853 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1856 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1860 /*----------------------------------------------------------------------------
1861 | Rounds the single-precision floating-point value `a' to an integer, and
1862 | returns the result as a single-precision floating-point value. The
1863 | operation is performed according to the IEC/IEEE Standard for Binary
1864 | Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1867 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1871 uint32_t lastBitMask
, roundBitsMask
;
1873 a
= float32_squash_input_denormal(a STATUS_VAR
);
1875 aExp
= extractFloat32Exp( a
);
1876 if ( 0x96 <= aExp
) {
1877 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1878 return propagateFloat32NaN( a
, a STATUS_VAR
);
1882 if ( aExp
<= 0x7E ) {
1883 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1884 STATUS(float_exception_flags
) |= float_flag_inexact
;
1885 aSign
= extractFloat32Sign( a
);
1886 switch ( STATUS(float_rounding_mode
) ) {
1887 case float_round_nearest_even
:
1888 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1889 return packFloat32( aSign
, 0x7F, 0 );
1892 case float_round_ties_away
:
1894 return packFloat32(aSign
, 0x7F, 0);
1897 case float_round_down
:
1898 return make_float32(aSign
? 0xBF800000 : 0);
1899 case float_round_up
:
1900 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1902 return packFloat32( aSign
, 0, 0 );
1905 lastBitMask
<<= 0x96 - aExp
;
1906 roundBitsMask
= lastBitMask
- 1;
1908 switch (STATUS(float_rounding_mode
)) {
1909 case float_round_nearest_even
:
1910 z
+= lastBitMask
>>1;
1911 if ((z
& roundBitsMask
) == 0) {
1915 case float_round_ties_away
:
1916 z
+= lastBitMask
>> 1;
1918 case float_round_to_zero
:
1920 case float_round_up
:
1921 if (!extractFloat32Sign(make_float32(z
))) {
1925 case float_round_down
:
1926 if (extractFloat32Sign(make_float32(z
))) {
1933 z
&= ~ roundBitsMask
;
1934 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1935 return make_float32(z
);
1939 /*----------------------------------------------------------------------------
1940 | Returns the result of adding the absolute values of the single-precision
1941 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1942 | before being returned. `zSign' is ignored if the result is a NaN.
1943 | The addition is performed according to the IEC/IEEE Standard for Binary
1944 | Floating-Point Arithmetic.
1945 *----------------------------------------------------------------------------*/
1947 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1949 int_fast16_t aExp
, bExp
, zExp
;
1950 uint32_t aSig
, bSig
, zSig
;
1951 int_fast16_t expDiff
;
1953 aSig
= extractFloat32Frac( a
);
1954 aExp
= extractFloat32Exp( a
);
1955 bSig
= extractFloat32Frac( b
);
1956 bExp
= extractFloat32Exp( b
);
1957 expDiff
= aExp
- bExp
;
1960 if ( 0 < expDiff
) {
1961 if ( aExp
== 0xFF ) {
1962 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1971 shift32RightJamming( bSig
, expDiff
, &bSig
);
1974 else if ( expDiff
< 0 ) {
1975 if ( bExp
== 0xFF ) {
1976 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1977 return packFloat32( zSign
, 0xFF, 0 );
1985 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1989 if ( aExp
== 0xFF ) {
1990 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1994 if (STATUS(flush_to_zero
)) {
1996 float_raise(float_flag_output_denormal STATUS_VAR
);
1998 return packFloat32(zSign
, 0, 0);
2000 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
2002 zSig
= 0x40000000 + aSig
+ bSig
;
2007 zSig
= ( aSig
+ bSig
)<<1;
2009 if ( (int32_t) zSig
< 0 ) {
2014 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2018 /*----------------------------------------------------------------------------
2019 | Returns the result of subtracting the absolute values of the single-
2020 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2021 | difference is negated before being returned. `zSign' is ignored if the
2022 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2023 | Standard for Binary Floating-Point Arithmetic.
2024 *----------------------------------------------------------------------------*/
2026 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
2028 int_fast16_t aExp
, bExp
, zExp
;
2029 uint32_t aSig
, bSig
, zSig
;
2030 int_fast16_t expDiff
;
2032 aSig
= extractFloat32Frac( a
);
2033 aExp
= extractFloat32Exp( a
);
2034 bSig
= extractFloat32Frac( b
);
2035 bExp
= extractFloat32Exp( b
);
2036 expDiff
= aExp
- bExp
;
2039 if ( 0 < expDiff
) goto aExpBigger
;
2040 if ( expDiff
< 0 ) goto bExpBigger
;
2041 if ( aExp
== 0xFF ) {
2042 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2043 float_raise( float_flag_invalid STATUS_VAR
);
2044 return float32_default_nan
;
2050 if ( bSig
< aSig
) goto aBigger
;
2051 if ( aSig
< bSig
) goto bBigger
;
2052 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
2054 if ( bExp
== 0xFF ) {
2055 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2056 return packFloat32( zSign
^ 1, 0xFF, 0 );
2064 shift32RightJamming( aSig
, - expDiff
, &aSig
);
2070 goto normalizeRoundAndPack
;
2072 if ( aExp
== 0xFF ) {
2073 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2082 shift32RightJamming( bSig
, expDiff
, &bSig
);
2087 normalizeRoundAndPack
:
2089 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2093 /*----------------------------------------------------------------------------
2094 | Returns the result of adding the single-precision floating-point values `a'
2095 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2096 | Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2099 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
2102 a
= float32_squash_input_denormal(a STATUS_VAR
);
2103 b
= float32_squash_input_denormal(b STATUS_VAR
);
2105 aSign
= extractFloat32Sign( a
);
2106 bSign
= extractFloat32Sign( b
);
2107 if ( aSign
== bSign
) {
2108 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2111 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2116 /*----------------------------------------------------------------------------
2117 | Returns the result of subtracting the single-precision floating-point values
2118 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2119 | for Binary Floating-Point Arithmetic.
2120 *----------------------------------------------------------------------------*/
2122 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
2125 a
= float32_squash_input_denormal(a STATUS_VAR
);
2126 b
= float32_squash_input_denormal(b STATUS_VAR
);
2128 aSign
= extractFloat32Sign( a
);
2129 bSign
= extractFloat32Sign( b
);
2130 if ( aSign
== bSign
) {
2131 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2134 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
2139 /*----------------------------------------------------------------------------
2140 | Returns the result of multiplying the single-precision floating-point values
2141 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2142 | for Binary Floating-Point Arithmetic.
2143 *----------------------------------------------------------------------------*/
2145 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
2147 flag aSign
, bSign
, zSign
;
2148 int_fast16_t aExp
, bExp
, zExp
;
2149 uint32_t aSig
, bSig
;
2153 a
= float32_squash_input_denormal(a STATUS_VAR
);
2154 b
= float32_squash_input_denormal(b STATUS_VAR
);
2156 aSig
= extractFloat32Frac( a
);
2157 aExp
= extractFloat32Exp( a
);
2158 aSign
= extractFloat32Sign( a
);
2159 bSig
= extractFloat32Frac( b
);
2160 bExp
= extractFloat32Exp( b
);
2161 bSign
= extractFloat32Sign( b
);
2162 zSign
= aSign
^ bSign
;
2163 if ( aExp
== 0xFF ) {
2164 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2165 return propagateFloat32NaN( a
, b STATUS_VAR
);
2167 if ( ( bExp
| bSig
) == 0 ) {
2168 float_raise( float_flag_invalid STATUS_VAR
);
2169 return float32_default_nan
;
2171 return packFloat32( zSign
, 0xFF, 0 );
2173 if ( bExp
== 0xFF ) {
2174 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2175 if ( ( aExp
| aSig
) == 0 ) {
2176 float_raise( float_flag_invalid STATUS_VAR
);
2177 return float32_default_nan
;
2179 return packFloat32( zSign
, 0xFF, 0 );
2182 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2183 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2186 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2187 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2189 zExp
= aExp
+ bExp
- 0x7F;
2190 aSig
= ( aSig
| 0x00800000 )<<7;
2191 bSig
= ( bSig
| 0x00800000 )<<8;
2192 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2194 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2198 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of dividing the single-precision floating-point value `a'
2204 | by the corresponding value `b'. The operation is performed according to the
2205 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2208 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
2210 flag aSign
, bSign
, zSign
;
2211 int_fast16_t aExp
, bExp
, zExp
;
2212 uint32_t aSig
, bSig
, zSig
;
2213 a
= float32_squash_input_denormal(a STATUS_VAR
);
2214 b
= float32_squash_input_denormal(b STATUS_VAR
);
2216 aSig
= extractFloat32Frac( a
);
2217 aExp
= extractFloat32Exp( a
);
2218 aSign
= extractFloat32Sign( a
);
2219 bSig
= extractFloat32Frac( b
);
2220 bExp
= extractFloat32Exp( b
);
2221 bSign
= extractFloat32Sign( b
);
2222 zSign
= aSign
^ bSign
;
2223 if ( aExp
== 0xFF ) {
2224 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2225 if ( bExp
== 0xFF ) {
2226 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2227 float_raise( float_flag_invalid STATUS_VAR
);
2228 return float32_default_nan
;
2230 return packFloat32( zSign
, 0xFF, 0 );
2232 if ( bExp
== 0xFF ) {
2233 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2234 return packFloat32( zSign
, 0, 0 );
2238 if ( ( aExp
| aSig
) == 0 ) {
2239 float_raise( float_flag_invalid STATUS_VAR
);
2240 return float32_default_nan
;
2242 float_raise( float_flag_divbyzero STATUS_VAR
);
2243 return packFloat32( zSign
, 0xFF, 0 );
2245 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2248 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2249 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2251 zExp
= aExp
- bExp
+ 0x7D;
2252 aSig
= ( aSig
| 0x00800000 )<<7;
2253 bSig
= ( bSig
| 0x00800000 )<<8;
2254 if ( bSig
<= ( aSig
+ aSig
) ) {
2258 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2259 if ( ( zSig
& 0x3F ) == 0 ) {
2260 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2262 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2266 /*----------------------------------------------------------------------------
2267 | Returns the remainder of the single-precision floating-point value `a'
2268 | with respect to the corresponding value `b'. The operation is performed
2269 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2272 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2275 int_fast16_t aExp
, bExp
, expDiff
;
2276 uint32_t aSig
, bSig
;
2278 uint64_t aSig64
, bSig64
, q64
;
2279 uint32_t alternateASig
;
2281 a
= float32_squash_input_denormal(a STATUS_VAR
);
2282 b
= float32_squash_input_denormal(b STATUS_VAR
);
2284 aSig
= extractFloat32Frac( a
);
2285 aExp
= extractFloat32Exp( a
);
2286 aSign
= extractFloat32Sign( a
);
2287 bSig
= extractFloat32Frac( b
);
2288 bExp
= extractFloat32Exp( b
);
2289 if ( aExp
== 0xFF ) {
2290 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2291 return propagateFloat32NaN( a
, b STATUS_VAR
);
2293 float_raise( float_flag_invalid STATUS_VAR
);
2294 return float32_default_nan
;
2296 if ( bExp
== 0xFF ) {
2297 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2302 float_raise( float_flag_invalid STATUS_VAR
);
2303 return float32_default_nan
;
2305 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2308 if ( aSig
== 0 ) return a
;
2309 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2311 expDiff
= aExp
- bExp
;
2314 if ( expDiff
< 32 ) {
2317 if ( expDiff
< 0 ) {
2318 if ( expDiff
< -1 ) return a
;
2321 q
= ( bSig
<= aSig
);
2322 if ( q
) aSig
-= bSig
;
2323 if ( 0 < expDiff
) {
2324 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2327 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2335 if ( bSig
<= aSig
) aSig
-= bSig
;
2336 aSig64
= ( (uint64_t) aSig
)<<40;
2337 bSig64
= ( (uint64_t) bSig
)<<40;
2339 while ( 0 < expDiff
) {
2340 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2341 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2342 aSig64
= - ( ( bSig
* q64
)<<38 );
2346 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2347 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2348 q
= q64
>>( 64 - expDiff
);
2350 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2353 alternateASig
= aSig
;
2356 } while ( 0 <= (int32_t) aSig
);
2357 sigMean
= aSig
+ alternateASig
;
2358 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2359 aSig
= alternateASig
;
2361 zSign
= ( (int32_t) aSig
< 0 );
2362 if ( zSign
) aSig
= - aSig
;
2363 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2367 /*----------------------------------------------------------------------------
2368 | Returns the result of multiplying the single-precision floating-point values
2369 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2370 | multiplication. The operation is performed according to the IEC/IEEE
2371 | Standard for Binary Floating-Point Arithmetic 754-2008.
2372 | The flags argument allows the caller to select negation of the
2373 | addend, the intermediate product, or the final result. (The difference
2374 | between this and having the caller do a separate negation is that negating
2375 | externally will flip the sign bit on NaNs.)
2376 *----------------------------------------------------------------------------*/
2378 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags STATUS_PARAM
)
2380 flag aSign
, bSign
, cSign
, zSign
;
2381 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2382 uint32_t aSig
, bSig
, cSig
;
2383 flag pInf
, pZero
, pSign
;
2384 uint64_t pSig64
, cSig64
, zSig64
;
2387 flag signflip
, infzero
;
2389 a
= float32_squash_input_denormal(a STATUS_VAR
);
2390 b
= float32_squash_input_denormal(b STATUS_VAR
);
2391 c
= float32_squash_input_denormal(c STATUS_VAR
);
2392 aSig
= extractFloat32Frac(a
);
2393 aExp
= extractFloat32Exp(a
);
2394 aSign
= extractFloat32Sign(a
);
2395 bSig
= extractFloat32Frac(b
);
2396 bExp
= extractFloat32Exp(b
);
2397 bSign
= extractFloat32Sign(b
);
2398 cSig
= extractFloat32Frac(c
);
2399 cExp
= extractFloat32Exp(c
);
2400 cSign
= extractFloat32Sign(c
);
2402 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2403 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2405 /* It is implementation-defined whether the cases of (0,inf,qnan)
2406 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2407 * they return if they do), so we have to hand this information
2408 * off to the target-specific pick-a-NaN routine.
2410 if (((aExp
== 0xff) && aSig
) ||
2411 ((bExp
== 0xff) && bSig
) ||
2412 ((cExp
== 0xff) && cSig
)) {
2413 return propagateFloat32MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
2417 float_raise(float_flag_invalid STATUS_VAR
);
2418 return float32_default_nan
;
2421 if (flags
& float_muladd_negate_c
) {
2425 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2427 /* Work out the sign and type of the product */
2428 pSign
= aSign
^ bSign
;
2429 if (flags
& float_muladd_negate_product
) {
2432 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2433 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2436 if (pInf
&& (pSign
^ cSign
)) {
2437 /* addition of opposite-signed infinities => InvalidOperation */
2438 float_raise(float_flag_invalid STATUS_VAR
);
2439 return float32_default_nan
;
2441 /* Otherwise generate an infinity of the same sign */
2442 return packFloat32(cSign
^ signflip
, 0xff, 0);
2446 return packFloat32(pSign
^ signflip
, 0xff, 0);
2452 /* Adding two exact zeroes */
2453 if (pSign
== cSign
) {
2455 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
2460 return packFloat32(zSign
^ signflip
, 0, 0);
2462 /* Exact zero plus a denorm */
2463 if (STATUS(flush_to_zero
)) {
2464 float_raise(float_flag_output_denormal STATUS_VAR
);
2465 return packFloat32(cSign
^ signflip
, 0, 0);
2468 /* Zero plus something non-zero : just return the something */
2469 if (flags
& float_muladd_halve_result
) {
2471 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2473 /* Subtract one to halve, and one again because roundAndPackFloat32
2474 * wants one less than the true exponent.
2477 cSig
= (cSig
| 0x00800000) << 7;
2478 return roundAndPackFloat32(cSign
^ signflip
, cExp
, cSig STATUS_VAR
);
2480 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2484 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2487 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2490 /* Calculate the actual result a * b + c */
2492 /* Multiply first; this is easy. */
2493 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2494 * because we want the true exponent, not the "one-less-than"
2495 * flavour that roundAndPackFloat32() takes.
2497 pExp
= aExp
+ bExp
- 0x7e;
2498 aSig
= (aSig
| 0x00800000) << 7;
2499 bSig
= (bSig
| 0x00800000) << 8;
2500 pSig64
= (uint64_t)aSig
* bSig
;
2501 if ((int64_t)(pSig64
<< 1) >= 0) {
2506 zSign
= pSign
^ signflip
;
2508 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2513 /* Throw out the special case of c being an exact zero now */
2514 shift64RightJamming(pSig64
, 32, &pSig64
);
2516 if (flags
& float_muladd_halve_result
) {
2519 return roundAndPackFloat32(zSign
, pExp
- 1,
2522 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2525 cSig64
= (uint64_t)cSig
<< (62 - 23);
2526 cSig64
|= LIT64(0x4000000000000000);
2527 expDiff
= pExp
- cExp
;
2529 if (pSign
== cSign
) {
2532 /* scale c to match p */
2533 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2535 } else if (expDiff
< 0) {
2536 /* scale p to match c */
2537 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2540 /* no scaling needed */
2543 /* Add significands and make sure explicit bit ends up in posn 62 */
2544 zSig64
= pSig64
+ cSig64
;
2545 if ((int64_t)zSig64
< 0) {
2546 shift64RightJamming(zSig64
, 1, &zSig64
);
2553 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2554 zSig64
= pSig64
- cSig64
;
2556 } else if (expDiff
< 0) {
2557 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2558 zSig64
= cSig64
- pSig64
;
2563 if (cSig64
< pSig64
) {
2564 zSig64
= pSig64
- cSig64
;
2565 } else if (pSig64
< cSig64
) {
2566 zSig64
= cSig64
- pSig64
;
2571 if (STATUS(float_rounding_mode
) == float_round_down
) {
2574 return packFloat32(zSign
, 0, 0);
2578 /* Normalize to put the explicit bit back into bit 62. */
2579 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2580 zSig64
<<= shiftcount
;
2583 if (flags
& float_muladd_halve_result
) {
2587 shift64RightJamming(zSig64
, 32, &zSig64
);
2588 return roundAndPackFloat32(zSign
, zExp
, zSig64 STATUS_VAR
);
2592 /*----------------------------------------------------------------------------
2593 | Returns the square root of the single-precision floating-point value `a'.
2594 | The operation is performed according to the IEC/IEEE Standard for Binary
2595 | Floating-Point Arithmetic.
2596 *----------------------------------------------------------------------------*/
2598 float32
float32_sqrt( float32 a STATUS_PARAM
)
2601 int_fast16_t aExp
, zExp
;
2602 uint32_t aSig
, zSig
;
2604 a
= float32_squash_input_denormal(a STATUS_VAR
);
2606 aSig
= extractFloat32Frac( a
);
2607 aExp
= extractFloat32Exp( a
);
2608 aSign
= extractFloat32Sign( a
);
2609 if ( aExp
== 0xFF ) {
2610 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2611 if ( ! aSign
) return a
;
2612 float_raise( float_flag_invalid STATUS_VAR
);
2613 return float32_default_nan
;
2616 if ( ( aExp
| aSig
) == 0 ) return a
;
2617 float_raise( float_flag_invalid STATUS_VAR
);
2618 return float32_default_nan
;
2621 if ( aSig
== 0 ) return float32_zero
;
2622 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2624 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2625 aSig
= ( aSig
| 0x00800000 )<<8;
2626 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2627 if ( ( zSig
& 0x7F ) <= 5 ) {
2633 term
= ( (uint64_t) zSig
) * zSig
;
2634 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2635 while ( (int64_t) rem
< 0 ) {
2637 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2639 zSig
|= ( rem
!= 0 );
2641 shift32RightJamming( zSig
, 1, &zSig
);
2643 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2647 /*----------------------------------------------------------------------------
2648 | Returns the binary exponential of the single-precision floating-point value
2649 | `a'. The operation is performed according to the IEC/IEEE Standard for
2650 | Binary Floating-Point Arithmetic.
2652 | Uses the following identities:
2654 | 1. -------------------------------------------------------------------------
2658 | 2. -------------------------------------------------------------------------
2661 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2663 *----------------------------------------------------------------------------*/
2665 static const float64 float32_exp2_coefficients
[15] =
2667 const_float64( 0x3ff0000000000000ll
), /* 1 */
2668 const_float64( 0x3fe0000000000000ll
), /* 2 */
2669 const_float64( 0x3fc5555555555555ll
), /* 3 */
2670 const_float64( 0x3fa5555555555555ll
), /* 4 */
2671 const_float64( 0x3f81111111111111ll
), /* 5 */
2672 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2673 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2674 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2675 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2676 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2677 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2678 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2679 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2680 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2681 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2684 float32
float32_exp2( float32 a STATUS_PARAM
)
2691 a
= float32_squash_input_denormal(a STATUS_VAR
);
2693 aSig
= extractFloat32Frac( a
);
2694 aExp
= extractFloat32Exp( a
);
2695 aSign
= extractFloat32Sign( a
);
2697 if ( aExp
== 0xFF) {
2698 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2699 return (aSign
) ? float32_zero
: a
;
2702 if (aSig
== 0) return float32_one
;
2705 float_raise( float_flag_inexact STATUS_VAR
);
2707 /* ******************************* */
2708 /* using float64 for approximation */
2709 /* ******************************* */
2710 x
= float32_to_float64(a STATUS_VAR
);
2711 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2715 for (i
= 0 ; i
< 15 ; i
++) {
2718 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2719 r
= float64_add(r
, f STATUS_VAR
);
2721 xn
= float64_mul(xn
, x STATUS_VAR
);
2724 return float64_to_float32(r
, status
);
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary log of the single-precision floating-point value `a'.
2729 | The operation is performed according to the IEC/IEEE Standard for Binary
2730 | Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2732 float32
float32_log2( float32 a STATUS_PARAM
)
2736 uint32_t aSig
, zSig
, i
;
2738 a
= float32_squash_input_denormal(a STATUS_VAR
);
2739 aSig
= extractFloat32Frac( a
);
2740 aExp
= extractFloat32Exp( a
);
2741 aSign
= extractFloat32Sign( a
);
2744 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2745 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2748 float_raise( float_flag_invalid STATUS_VAR
);
2749 return float32_default_nan
;
2751 if ( aExp
== 0xFF ) {
2752 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2761 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2762 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2763 if ( aSig
& 0x01000000 ) {
2772 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2775 /*----------------------------------------------------------------------------
2776 | Returns 1 if the single-precision floating-point value `a' is equal to
2777 | the corresponding value `b', and 0 otherwise. The invalid exception is
2778 | raised if either operand is a NaN. Otherwise, the comparison is performed
2779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2780 *----------------------------------------------------------------------------*/
2782 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2785 a
= float32_squash_input_denormal(a STATUS_VAR
);
2786 b
= float32_squash_input_denormal(b STATUS_VAR
);
2788 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2789 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2791 float_raise( float_flag_invalid STATUS_VAR
);
2794 av
= float32_val(a
);
2795 bv
= float32_val(b
);
2796 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2799 /*----------------------------------------------------------------------------
2800 | Returns 1 if the single-precision floating-point value `a' is less than
2801 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2802 | exception is raised if either operand is a NaN. The comparison is performed
2803 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2804 *----------------------------------------------------------------------------*/
2806 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2810 a
= float32_squash_input_denormal(a STATUS_VAR
);
2811 b
= float32_squash_input_denormal(b STATUS_VAR
);
2813 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2814 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2816 float_raise( float_flag_invalid STATUS_VAR
);
2819 aSign
= extractFloat32Sign( a
);
2820 bSign
= extractFloat32Sign( b
);
2821 av
= float32_val(a
);
2822 bv
= float32_val(b
);
2823 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2824 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than
2830 | the corresponding value `b', and 0 otherwise. The invalid exception is
2831 | raised if either operand is a NaN. The comparison is performed according
2832 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2835 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2839 a
= float32_squash_input_denormal(a STATUS_VAR
);
2840 b
= float32_squash_input_denormal(b STATUS_VAR
);
2842 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2843 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2845 float_raise( float_flag_invalid STATUS_VAR
);
2848 aSign
= extractFloat32Sign( a
);
2849 bSign
= extractFloat32Sign( b
);
2850 av
= float32_val(a
);
2851 bv
= float32_val(b
);
2852 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2853 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2857 /*----------------------------------------------------------------------------
2858 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2859 | be compared, and 0 otherwise. The invalid exception is raised if either
2860 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2861 | Standard for Binary Floating-Point Arithmetic.
2862 *----------------------------------------------------------------------------*/
2864 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2866 a
= float32_squash_input_denormal(a STATUS_VAR
);
2867 b
= float32_squash_input_denormal(b STATUS_VAR
);
2869 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2870 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2872 float_raise( float_flag_invalid STATUS_VAR
);
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2881 | exception. The comparison is performed according to the IEC/IEEE Standard
2882 | for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2887 a
= float32_squash_input_denormal(a STATUS_VAR
);
2888 b
= float32_squash_input_denormal(b STATUS_VAR
);
2890 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2891 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2893 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2894 float_raise( float_flag_invalid STATUS_VAR
);
2898 return ( float32_val(a
) == float32_val(b
) ) ||
2899 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than or
2904 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2905 | cause an exception. Otherwise, the comparison is performed according to the
2906 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2909 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2913 a
= float32_squash_input_denormal(a STATUS_VAR
);
2914 b
= float32_squash_input_denormal(b STATUS_VAR
);
2916 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2917 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2919 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2920 float_raise( float_flag_invalid STATUS_VAR
);
2924 aSign
= extractFloat32Sign( a
);
2925 bSign
= extractFloat32Sign( b
);
2926 av
= float32_val(a
);
2927 bv
= float32_val(b
);
2928 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2929 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2933 /*----------------------------------------------------------------------------
2934 | Returns 1 if the single-precision floating-point value `a' is less than
2935 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2936 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2937 | Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2940 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2944 a
= float32_squash_input_denormal(a STATUS_VAR
);
2945 b
= float32_squash_input_denormal(b STATUS_VAR
);
2947 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2948 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2950 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2951 float_raise( float_flag_invalid STATUS_VAR
);
2955 aSign
= extractFloat32Sign( a
);
2956 bSign
= extractFloat32Sign( b
);
2957 av
= float32_val(a
);
2958 bv
= float32_val(b
);
2959 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2960 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2964 /*----------------------------------------------------------------------------
2965 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2966 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2967 | comparison is performed according to the IEC/IEEE Standard for Binary
2968 | Floating-Point Arithmetic.
2969 *----------------------------------------------------------------------------*/
2971 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2973 a
= float32_squash_input_denormal(a STATUS_VAR
);
2974 b
= float32_squash_input_denormal(b STATUS_VAR
);
2976 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2977 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2979 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2980 float_raise( float_flag_invalid STATUS_VAR
);
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 32-bit two's complement integer format. The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic---which means in particular that the conversion is rounded
2992 | according to the current rounding mode. If `a' is a NaN, the largest
2993 | positive integer is returned. Otherwise, if the conversion overflows, the
2994 | largest integer with the same sign as `a' is returned.
2995 *----------------------------------------------------------------------------*/
2997 int32
float64_to_int32( float64 a STATUS_PARAM
)
3000 int_fast16_t aExp
, shiftCount
;
3002 a
= float64_squash_input_denormal(a STATUS_VAR
);
3004 aSig
= extractFloat64Frac( a
);
3005 aExp
= extractFloat64Exp( a
);
3006 aSign
= extractFloat64Sign( a
);
3007 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3008 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3009 shiftCount
= 0x42C - aExp
;
3010 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
3011 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of converting the double-precision floating-point value
3017 | `a' to the 32-bit two's complement integer format. The conversion is
3018 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3019 | Arithmetic, except that the conversion is always rounded toward zero.
3020 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3021 | the conversion overflows, the largest integer with the same sign as `a' is
3023 *----------------------------------------------------------------------------*/
3025 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
3028 int_fast16_t aExp
, shiftCount
;
3029 uint64_t aSig
, savedASig
;
3031 a
= float64_squash_input_denormal(a STATUS_VAR
);
3033 aSig
= extractFloat64Frac( a
);
3034 aExp
= extractFloat64Exp( a
);
3035 aSign
= extractFloat64Sign( a
);
3036 if ( 0x41E < aExp
) {
3037 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
3040 else if ( aExp
< 0x3FF ) {
3041 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3044 aSig
|= LIT64( 0x0010000000000000 );
3045 shiftCount
= 0x433 - aExp
;
3047 aSig
>>= shiftCount
;
3049 if ( aSign
) z
= - z
;
3050 if ( ( z
< 0 ) ^ aSign
) {
3052 float_raise( float_flag_invalid STATUS_VAR
);
3053 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3055 if ( ( aSig
<<shiftCount
) != savedASig
) {
3056 STATUS(float_exception_flags
) |= float_flag_inexact
;
3062 /*----------------------------------------------------------------------------
3063 | Returns the result of converting the double-precision floating-point value
3064 | `a' to the 16-bit two's complement integer format. The conversion is
3065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3066 | Arithmetic, except that the conversion is always rounded toward zero.
3067 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3068 | the conversion overflows, the largest integer with the same sign as `a' is
3070 *----------------------------------------------------------------------------*/
3072 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM
)
3075 int_fast16_t aExp
, shiftCount
;
3076 uint64_t aSig
, savedASig
;
3079 aSig
= extractFloat64Frac( a
);
3080 aExp
= extractFloat64Exp( a
);
3081 aSign
= extractFloat64Sign( a
);
3082 if ( 0x40E < aExp
) {
3083 if ( ( aExp
== 0x7FF ) && aSig
) {
3088 else if ( aExp
< 0x3FF ) {
3089 if ( aExp
|| aSig
) {
3090 STATUS(float_exception_flags
) |= float_flag_inexact
;
3094 aSig
|= LIT64( 0x0010000000000000 );
3095 shiftCount
= 0x433 - aExp
;
3097 aSig
>>= shiftCount
;
3102 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
3104 float_raise( float_flag_invalid STATUS_VAR
);
3105 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
3107 if ( ( aSig
<<shiftCount
) != savedASig
) {
3108 STATUS(float_exception_flags
) |= float_flag_inexact
;
3113 /*----------------------------------------------------------------------------
3114 | Returns the result of converting the double-precision floating-point value
3115 | `a' to the 64-bit two's complement integer format. The conversion is
3116 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3117 | Arithmetic---which means in particular that the conversion is rounded
3118 | according to the current rounding mode. If `a' is a NaN, the largest
3119 | positive integer is returned. Otherwise, if the conversion overflows, the
3120 | largest integer with the same sign as `a' is returned.
3121 *----------------------------------------------------------------------------*/
3123 int64
float64_to_int64( float64 a STATUS_PARAM
)
3126 int_fast16_t aExp
, shiftCount
;
3127 uint64_t aSig
, aSigExtra
;
3128 a
= float64_squash_input_denormal(a STATUS_VAR
);
3130 aSig
= extractFloat64Frac( a
);
3131 aExp
= extractFloat64Exp( a
);
3132 aSign
= extractFloat64Sign( a
);
3133 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3134 shiftCount
= 0x433 - aExp
;
3135 if ( shiftCount
<= 0 ) {
3136 if ( 0x43E < aExp
) {
3137 float_raise( float_flag_invalid STATUS_VAR
);
3139 || ( ( aExp
== 0x7FF )
3140 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3142 return LIT64( 0x7FFFFFFFFFFFFFFF );
3144 return (int64_t) LIT64( 0x8000000000000000 );
3147 aSig
<<= - shiftCount
;
3150 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3152 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the double-precision floating-point value
3158 | `a' to the 64-bit two's complement integer format. The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 | Arithmetic, except that the conversion is always rounded toward zero.
3161 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3162 | the conversion overflows, the largest integer with the same sign as `a' is
3164 *----------------------------------------------------------------------------*/
3166 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
3169 int_fast16_t aExp
, shiftCount
;
3172 a
= float64_squash_input_denormal(a STATUS_VAR
);
3174 aSig
= extractFloat64Frac( a
);
3175 aExp
= extractFloat64Exp( a
);
3176 aSign
= extractFloat64Sign( a
);
3177 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3178 shiftCount
= aExp
- 0x433;
3179 if ( 0 <= shiftCount
) {
3180 if ( 0x43E <= aExp
) {
3181 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3182 float_raise( float_flag_invalid STATUS_VAR
);
3184 || ( ( aExp
== 0x7FF )
3185 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3187 return LIT64( 0x7FFFFFFFFFFFFFFF );
3190 return (int64_t) LIT64( 0x8000000000000000 );
3192 z
= aSig
<<shiftCount
;
3195 if ( aExp
< 0x3FE ) {
3196 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3199 z
= aSig
>>( - shiftCount
);
3200 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3201 STATUS(float_exception_flags
) |= float_flag_inexact
;
3204 if ( aSign
) z
= - z
;
3209 /*----------------------------------------------------------------------------
3210 | Returns the result of converting the double-precision floating-point value
3211 | `a' to the single-precision floating-point format. The conversion is
3212 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3214 *----------------------------------------------------------------------------*/
3216 float32
float64_to_float32( float64 a STATUS_PARAM
)
3222 a
= float64_squash_input_denormal(a STATUS_VAR
);
3224 aSig
= extractFloat64Frac( a
);
3225 aExp
= extractFloat64Exp( a
);
3226 aSign
= extractFloat64Sign( a
);
3227 if ( aExp
== 0x7FF ) {
3228 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3229 return packFloat32( aSign
, 0xFF, 0 );
3231 shift64RightJamming( aSig
, 22, &aSig
);
3233 if ( aExp
|| zSig
) {
3237 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
3242 /*----------------------------------------------------------------------------
3243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3244 | half-precision floating-point value, returning the result. After being
3245 | shifted into the proper positions, the three fields are simply added
3246 | together to form the result. This means that any integer portion of `zSig'
3247 | will be added into the exponent. Since a properly normalized significand
3248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3249 | than the desired result exponent whenever `zSig' is a complete, normalized
3251 *----------------------------------------------------------------------------*/
3252 static float16
packFloat16(flag zSign
, int_fast16_t zExp
, uint16_t zSig
)
3254 return make_float16(
3255 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3258 /*----------------------------------------------------------------------------
3259 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3260 | and significand `zSig', and returns the proper half-precision floating-
3261 | point value corresponding to the abstract input. Ordinarily, the abstract
3262 | value is simply rounded and packed into the half-precision format, with
3263 | the inexact exception raised if the abstract input cannot be represented
3264 | exactly. However, if the abstract value is too large, the overflow and
3265 | inexact exceptions are raised and an infinity or maximal finite value is
3266 | returned. If the abstract value is too small, the input value is rounded to
3267 | a subnormal number, and the underflow and inexact exceptions are raised if
3268 | the abstract input cannot be represented exactly as a subnormal half-
3269 | precision floating-point number.
3270 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3271 | ARM-style "alternative representation", which omits the NaN and Inf
3272 | encodings in order to raise the maximum representable exponent by one.
3273 | The input significand `zSig' has its binary point between bits 22
3274 | and 23, which is 13 bits to the left of the usual location. This shifted
3275 | significand must be normalized or smaller. If `zSig' is not normalized,
3276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3277 | and it must not require rounding. In the usual case that `zSig' is
3278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3279 | Note the slightly odd position of the binary point in zSig compared with the
3280 | other roundAndPackFloat functions. This should probably be fixed if we
3281 | need to implement more float16 routines than just conversion.
3282 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3283 | Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3286 static float32
roundAndPackFloat16(flag zSign
, int_fast16_t zExp
,
3287 uint32_t zSig
, flag ieee STATUS_PARAM
)
3289 int maxexp
= ieee
? 29 : 30;
3292 bool rounding_bumps_exp
;
3293 bool is_tiny
= false;
3295 /* Calculate the mask of bits of the mantissa which are not
3296 * representable in half-precision and will be lost.
3299 /* Will be denormal in halfprec */
3305 /* Normal number in halfprec */
3309 switch (STATUS(float_rounding_mode
)) {
3310 case float_round_nearest_even
:
3311 increment
= (mask
+ 1) >> 1;
3312 if ((zSig
& mask
) == increment
) {
3313 increment
= zSig
& (increment
<< 1);
3316 case float_round_ties_away
:
3317 increment
= (mask
+ 1) >> 1;
3319 case float_round_up
:
3320 increment
= zSign
? 0 : mask
;
3322 case float_round_down
:
3323 increment
= zSign
? mask
: 0;
3325 default: /* round_to_zero */
3330 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3332 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3334 float_raise(float_flag_overflow
| float_flag_inexact STATUS_VAR
);
3335 return packFloat16(zSign
, 0x1f, 0);
3337 float_raise(float_flag_invalid STATUS_VAR
);
3338 return packFloat16(zSign
, 0x1f, 0x3ff);
3343 /* Note that flush-to-zero does not affect half-precision results */
3345 (STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
3347 || (!rounding_bumps_exp
);
3350 float_raise(float_flag_inexact STATUS_VAR
);
3352 float_raise(float_flag_underflow STATUS_VAR
);
3357 if (rounding_bumps_exp
) {
3363 return packFloat16(zSign
, 0, 0);
3369 return packFloat16(zSign
, zExp
, zSig
>> 13);
3372 static void normalizeFloat16Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
,
3375 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3376 *zSigPtr
= aSig
<< shiftCount
;
3377 *zExpPtr
= 1 - shiftCount
;
3380 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3381 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3383 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
3389 aSign
= extractFloat16Sign(a
);
3390 aExp
= extractFloat16Exp(a
);
3391 aSig
= extractFloat16Frac(a
);
3393 if (aExp
== 0x1f && ieee
) {
3395 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3397 return packFloat32(aSign
, 0xff, 0);
3401 return packFloat32(aSign
, 0, 0);
3404 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3407 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3410 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
3416 a
= float32_squash_input_denormal(a STATUS_VAR
);
3418 aSig
= extractFloat32Frac( a
);
3419 aExp
= extractFloat32Exp( a
);
3420 aSign
= extractFloat32Sign( a
);
3421 if ( aExp
== 0xFF ) {
3423 /* Input is a NaN */
3425 float_raise(float_flag_invalid STATUS_VAR
);
3426 return packFloat16(aSign
, 0, 0);
3428 return commonNaNToFloat16(
3429 float32ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3433 float_raise(float_flag_invalid STATUS_VAR
);
3434 return packFloat16(aSign
, 0x1f, 0x3ff);
3436 return packFloat16(aSign
, 0x1f, 0);
3438 if (aExp
== 0 && aSig
== 0) {
3439 return packFloat16(aSign
, 0, 0);
3441 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3442 * even if the input is denormal; however this is harmless because
3443 * the largest possible single-precision denormal is still smaller
3444 * than the smallest representable half-precision denormal, and so we
3445 * will end up ignoring aSig and returning via the "always return zero"
3451 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee STATUS_VAR
);
3454 float64
float16_to_float64(float16 a
, flag ieee STATUS_PARAM
)
3460 aSign
= extractFloat16Sign(a
);
3461 aExp
= extractFloat16Exp(a
);
3462 aSig
= extractFloat16Frac(a
);
3464 if (aExp
== 0x1f && ieee
) {
3466 return commonNaNToFloat64(
3467 float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3469 return packFloat64(aSign
, 0x7ff, 0);
3473 return packFloat64(aSign
, 0, 0);
3476 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3479 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3482 float16
float64_to_float16(float64 a
, flag ieee STATUS_PARAM
)
3489 a
= float64_squash_input_denormal(a STATUS_VAR
);
3491 aSig
= extractFloat64Frac(a
);
3492 aExp
= extractFloat64Exp(a
);
3493 aSign
= extractFloat64Sign(a
);
3494 if (aExp
== 0x7FF) {
3496 /* Input is a NaN */
3498 float_raise(float_flag_invalid STATUS_VAR
);
3499 return packFloat16(aSign
, 0, 0);
3501 return commonNaNToFloat16(
3502 float64ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3506 float_raise(float_flag_invalid STATUS_VAR
);
3507 return packFloat16(aSign
, 0x1f, 0x3ff);
3509 return packFloat16(aSign
, 0x1f, 0);
3511 shift64RightJamming(aSig
, 29, &aSig
);
3513 if (aExp
== 0 && zSig
== 0) {
3514 return packFloat16(aSign
, 0, 0);
3516 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3517 * even if the input is denormal; however this is harmless because
3518 * the largest possible single-precision denormal is still smaller
3519 * than the smallest representable half-precision denormal, and so we
3520 * will end up ignoring aSig and returning via the "always return zero"
3526 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee STATUS_VAR
);
3529 /*----------------------------------------------------------------------------
3530 | Returns the result of converting the double-precision floating-point value
3531 | `a' to the extended double-precision floating-point format. The conversion
3532 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3534 *----------------------------------------------------------------------------*/
3536 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
3542 a
= float64_squash_input_denormal(a STATUS_VAR
);
3543 aSig
= extractFloat64Frac( a
);
3544 aExp
= extractFloat64Exp( a
);
3545 aSign
= extractFloat64Sign( a
);
3546 if ( aExp
== 0x7FF ) {
3547 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3548 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3551 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3552 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3556 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3560 /*----------------------------------------------------------------------------
3561 | Returns the result of converting the double-precision floating-point value
3562 | `a' to the quadruple-precision floating-point format. The conversion is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3565 *----------------------------------------------------------------------------*/
3567 float128
float64_to_float128( float64 a STATUS_PARAM
)
3571 uint64_t aSig
, zSig0
, zSig1
;
3573 a
= float64_squash_input_denormal(a STATUS_VAR
);
3574 aSig
= extractFloat64Frac( a
);
3575 aExp
= extractFloat64Exp( a
);
3576 aSign
= extractFloat64Sign( a
);
3577 if ( aExp
== 0x7FF ) {
3578 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3579 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3582 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3583 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3586 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3587 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3591 /*----------------------------------------------------------------------------
3592 | Rounds the double-precision floating-point value `a' to an integer, and
3593 | returns the result as a double-precision floating-point value. The
3594 | operation is performed according to the IEC/IEEE Standard for Binary
3595 | Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3598 float64
float64_round_to_int( float64 a STATUS_PARAM
)
3602 uint64_t lastBitMask
, roundBitsMask
;
3604 a
= float64_squash_input_denormal(a STATUS_VAR
);
3606 aExp
= extractFloat64Exp( a
);
3607 if ( 0x433 <= aExp
) {
3608 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3609 return propagateFloat64NaN( a
, a STATUS_VAR
);
3613 if ( aExp
< 0x3FF ) {
3614 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3615 STATUS(float_exception_flags
) |= float_flag_inexact
;
3616 aSign
= extractFloat64Sign( a
);
3617 switch ( STATUS(float_rounding_mode
) ) {
3618 case float_round_nearest_even
:
3619 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3620 return packFloat64( aSign
, 0x3FF, 0 );
3623 case float_round_ties_away
:
3624 if (aExp
== 0x3FE) {
3625 return packFloat64(aSign
, 0x3ff, 0);
3628 case float_round_down
:
3629 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3630 case float_round_up
:
3631 return make_float64(
3632 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3634 return packFloat64( aSign
, 0, 0 );
3637 lastBitMask
<<= 0x433 - aExp
;
3638 roundBitsMask
= lastBitMask
- 1;
3640 switch (STATUS(float_rounding_mode
)) {
3641 case float_round_nearest_even
:
3642 z
+= lastBitMask
>> 1;
3643 if ((z
& roundBitsMask
) == 0) {
3647 case float_round_ties_away
:
3648 z
+= lastBitMask
>> 1;
3650 case float_round_to_zero
:
3652 case float_round_up
:
3653 if (!extractFloat64Sign(make_float64(z
))) {
3657 case float_round_down
:
3658 if (extractFloat64Sign(make_float64(z
))) {
3665 z
&= ~ roundBitsMask
;
3666 if ( z
!= float64_val(a
) )
3667 STATUS(float_exception_flags
) |= float_flag_inexact
;
3668 return make_float64(z
);
3672 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3676 oldmode
= STATUS(float_rounding_mode
);
3677 STATUS(float_rounding_mode
) = float_round_to_zero
;
3678 res
= float64_round_to_int(a STATUS_VAR
);
3679 STATUS(float_rounding_mode
) = oldmode
;
3683 /*----------------------------------------------------------------------------
3684 | Returns the result of adding the absolute values of the double-precision
3685 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3686 | before being returned. `zSign' is ignored if the result is a NaN.
3687 | The addition is performed according to the IEC/IEEE Standard for Binary
3688 | Floating-Point Arithmetic.
3689 *----------------------------------------------------------------------------*/
3691 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3693 int_fast16_t aExp
, bExp
, zExp
;
3694 uint64_t aSig
, bSig
, zSig
;
3695 int_fast16_t expDiff
;
3697 aSig
= extractFloat64Frac( a
);
3698 aExp
= extractFloat64Exp( a
);
3699 bSig
= extractFloat64Frac( b
);
3700 bExp
= extractFloat64Exp( b
);
3701 expDiff
= aExp
- bExp
;
3704 if ( 0 < expDiff
) {
3705 if ( aExp
== 0x7FF ) {
3706 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3713 bSig
|= LIT64( 0x2000000000000000 );
3715 shift64RightJamming( bSig
, expDiff
, &bSig
);
3718 else if ( expDiff
< 0 ) {
3719 if ( bExp
== 0x7FF ) {
3720 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3721 return packFloat64( zSign
, 0x7FF, 0 );
3727 aSig
|= LIT64( 0x2000000000000000 );
3729 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3733 if ( aExp
== 0x7FF ) {
3734 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3738 if (STATUS(flush_to_zero
)) {
3740 float_raise(float_flag_output_denormal STATUS_VAR
);
3742 return packFloat64(zSign
, 0, 0);
3744 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3746 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3750 aSig
|= LIT64( 0x2000000000000000 );
3751 zSig
= ( aSig
+ bSig
)<<1;
3753 if ( (int64_t) zSig
< 0 ) {
3758 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3762 /*----------------------------------------------------------------------------
3763 | Returns the result of subtracting the absolute values of the double-
3764 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3765 | difference is negated before being returned. `zSign' is ignored if the
3766 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3767 | Standard for Binary Floating-Point Arithmetic.
3768 *----------------------------------------------------------------------------*/
3770 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3772 int_fast16_t aExp
, bExp
, zExp
;
3773 uint64_t aSig
, bSig
, zSig
;
3774 int_fast16_t expDiff
;
3776 aSig
= extractFloat64Frac( a
);
3777 aExp
= extractFloat64Exp( a
);
3778 bSig
= extractFloat64Frac( b
);
3779 bExp
= extractFloat64Exp( b
);
3780 expDiff
= aExp
- bExp
;
3783 if ( 0 < expDiff
) goto aExpBigger
;
3784 if ( expDiff
< 0 ) goto bExpBigger
;
3785 if ( aExp
== 0x7FF ) {
3786 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3787 float_raise( float_flag_invalid STATUS_VAR
);
3788 return float64_default_nan
;
3794 if ( bSig
< aSig
) goto aBigger
;
3795 if ( aSig
< bSig
) goto bBigger
;
3796 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3798 if ( bExp
== 0x7FF ) {
3799 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3800 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3806 aSig
|= LIT64( 0x4000000000000000 );
3808 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3809 bSig
|= LIT64( 0x4000000000000000 );
3814 goto normalizeRoundAndPack
;
3816 if ( aExp
== 0x7FF ) {
3817 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3824 bSig
|= LIT64( 0x4000000000000000 );
3826 shift64RightJamming( bSig
, expDiff
, &bSig
);
3827 aSig
|= LIT64( 0x4000000000000000 );
3831 normalizeRoundAndPack
:
3833 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3837 /*----------------------------------------------------------------------------
3838 | Returns the result of adding the double-precision floating-point values `a'
3839 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3840 | Binary Floating-Point Arithmetic.
3841 *----------------------------------------------------------------------------*/
3843 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3846 a
= float64_squash_input_denormal(a STATUS_VAR
);
3847 b
= float64_squash_input_denormal(b STATUS_VAR
);
3849 aSign
= extractFloat64Sign( a
);
3850 bSign
= extractFloat64Sign( b
);
3851 if ( aSign
== bSign
) {
3852 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3855 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the double-precision floating-point values
3862 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3863 | for Binary Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3866 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3869 a
= float64_squash_input_denormal(a STATUS_VAR
);
3870 b
= float64_squash_input_denormal(b STATUS_VAR
);
3872 aSign
= extractFloat64Sign( a
);
3873 bSign
= extractFloat64Sign( b
);
3874 if ( aSign
== bSign
) {
3875 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3878 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3883 /*----------------------------------------------------------------------------
3884 | Returns the result of multiplying the double-precision floating-point values
3885 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3886 | for Binary Floating-Point Arithmetic.
3887 *----------------------------------------------------------------------------*/
3889 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3891 flag aSign
, bSign
, zSign
;
3892 int_fast16_t aExp
, bExp
, zExp
;
3893 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3895 a
= float64_squash_input_denormal(a STATUS_VAR
);
3896 b
= float64_squash_input_denormal(b STATUS_VAR
);
3898 aSig
= extractFloat64Frac( a
);
3899 aExp
= extractFloat64Exp( a
);
3900 aSign
= extractFloat64Sign( a
);
3901 bSig
= extractFloat64Frac( b
);
3902 bExp
= extractFloat64Exp( b
);
3903 bSign
= extractFloat64Sign( b
);
3904 zSign
= aSign
^ bSign
;
3905 if ( aExp
== 0x7FF ) {
3906 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3907 return propagateFloat64NaN( a
, b STATUS_VAR
);
3909 if ( ( bExp
| bSig
) == 0 ) {
3910 float_raise( float_flag_invalid STATUS_VAR
);
3911 return float64_default_nan
;
3913 return packFloat64( zSign
, 0x7FF, 0 );
3915 if ( bExp
== 0x7FF ) {
3916 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3917 if ( ( aExp
| aSig
) == 0 ) {
3918 float_raise( float_flag_invalid STATUS_VAR
);
3919 return float64_default_nan
;
3921 return packFloat64( zSign
, 0x7FF, 0 );
3924 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3925 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3928 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3929 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3931 zExp
= aExp
+ bExp
- 0x3FF;
3932 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3933 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3934 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3935 zSig0
|= ( zSig1
!= 0 );
3936 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3940 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3944 /*----------------------------------------------------------------------------
3945 | Returns the result of dividing the double-precision floating-point value `a'
3946 | by the corresponding value `b'. The operation is performed according to
3947 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948 *----------------------------------------------------------------------------*/
3950 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3952 flag aSign
, bSign
, zSign
;
3953 int_fast16_t aExp
, bExp
, zExp
;
3954 uint64_t aSig
, bSig
, zSig
;
3955 uint64_t rem0
, rem1
;
3956 uint64_t term0
, term1
;
3957 a
= float64_squash_input_denormal(a STATUS_VAR
);
3958 b
= float64_squash_input_denormal(b STATUS_VAR
);
3960 aSig
= extractFloat64Frac( a
);
3961 aExp
= extractFloat64Exp( a
);
3962 aSign
= extractFloat64Sign( a
);
3963 bSig
= extractFloat64Frac( b
);
3964 bExp
= extractFloat64Exp( b
);
3965 bSign
= extractFloat64Sign( b
);
3966 zSign
= aSign
^ bSign
;
3967 if ( aExp
== 0x7FF ) {
3968 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3969 if ( bExp
== 0x7FF ) {
3970 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3971 float_raise( float_flag_invalid STATUS_VAR
);
3972 return float64_default_nan
;
3974 return packFloat64( zSign
, 0x7FF, 0 );
3976 if ( bExp
== 0x7FF ) {
3977 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3978 return packFloat64( zSign
, 0, 0 );
3982 if ( ( aExp
| aSig
) == 0 ) {
3983 float_raise( float_flag_invalid STATUS_VAR
);
3984 return float64_default_nan
;
3986 float_raise( float_flag_divbyzero STATUS_VAR
);
3987 return packFloat64( zSign
, 0x7FF, 0 );
3989 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3992 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3993 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3995 zExp
= aExp
- bExp
+ 0x3FD;
3996 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3997 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3998 if ( bSig
<= ( aSig
+ aSig
) ) {
4002 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
4003 if ( ( zSig
& 0x1FF ) <= 2 ) {
4004 mul64To128( bSig
, zSig
, &term0
, &term1
);
4005 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4006 while ( (int64_t) rem0
< 0 ) {
4008 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4010 zSig
|= ( rem1
!= 0 );
4012 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
4016 /*----------------------------------------------------------------------------
4017 | Returns the remainder of the double-precision floating-point value `a'
4018 | with respect to the corresponding value `b'. The operation is performed
4019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4020 *----------------------------------------------------------------------------*/
4022 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
4025 int_fast16_t aExp
, bExp
, expDiff
;
4026 uint64_t aSig
, bSig
;
4027 uint64_t q
, alternateASig
;
4030 a
= float64_squash_input_denormal(a STATUS_VAR
);
4031 b
= float64_squash_input_denormal(b STATUS_VAR
);
4032 aSig
= extractFloat64Frac( a
);
4033 aExp
= extractFloat64Exp( a
);
4034 aSign
= extractFloat64Sign( a
);
4035 bSig
= extractFloat64Frac( b
);
4036 bExp
= extractFloat64Exp( b
);
4037 if ( aExp
== 0x7FF ) {
4038 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
4039 return propagateFloat64NaN( a
, b STATUS_VAR
);
4041 float_raise( float_flag_invalid STATUS_VAR
);
4042 return float64_default_nan
;
4044 if ( bExp
== 0x7FF ) {
4045 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
4050 float_raise( float_flag_invalid STATUS_VAR
);
4051 return float64_default_nan
;
4053 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
4056 if ( aSig
== 0 ) return a
;
4057 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4059 expDiff
= aExp
- bExp
;
4060 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
4061 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
4062 if ( expDiff
< 0 ) {
4063 if ( expDiff
< -1 ) return a
;
4066 q
= ( bSig
<= aSig
);
4067 if ( q
) aSig
-= bSig
;
4069 while ( 0 < expDiff
) {
4070 q
= estimateDiv128To64( aSig
, 0, bSig
);
4071 q
= ( 2 < q
) ? q
- 2 : 0;
4072 aSig
= - ( ( bSig
>>2 ) * q
);
4076 if ( 0 < expDiff
) {
4077 q
= estimateDiv128To64( aSig
, 0, bSig
);
4078 q
= ( 2 < q
) ? q
- 2 : 0;
4081 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
4088 alternateASig
= aSig
;
4091 } while ( 0 <= (int64_t) aSig
);
4092 sigMean
= aSig
+ alternateASig
;
4093 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
4094 aSig
= alternateASig
;
4096 zSign
= ( (int64_t) aSig
< 0 );
4097 if ( zSign
) aSig
= - aSig
;
4098 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
4102 /*----------------------------------------------------------------------------
4103 | Returns the result of multiplying the double-precision floating-point values
4104 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4105 | multiplication. The operation is performed according to the IEC/IEEE
4106 | Standard for Binary Floating-Point Arithmetic 754-2008.
4107 | The flags argument allows the caller to select negation of the
4108 | addend, the intermediate product, or the final result. (The difference
4109 | between this and having the caller do a separate negation is that negating
4110 | externally will flip the sign bit on NaNs.)
4111 *----------------------------------------------------------------------------*/
4113 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags STATUS_PARAM
)
4115 flag aSign
, bSign
, cSign
, zSign
;
4116 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
4117 uint64_t aSig
, bSig
, cSig
;
4118 flag pInf
, pZero
, pSign
;
4119 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
4121 flag signflip
, infzero
;
4123 a
= float64_squash_input_denormal(a STATUS_VAR
);
4124 b
= float64_squash_input_denormal(b STATUS_VAR
);
4125 c
= float64_squash_input_denormal(c STATUS_VAR
);
4126 aSig
= extractFloat64Frac(a
);
4127 aExp
= extractFloat64Exp(a
);
4128 aSign
= extractFloat64Sign(a
);
4129 bSig
= extractFloat64Frac(b
);
4130 bExp
= extractFloat64Exp(b
);
4131 bSign
= extractFloat64Sign(b
);
4132 cSig
= extractFloat64Frac(c
);
4133 cExp
= extractFloat64Exp(c
);
4134 cSign
= extractFloat64Sign(c
);
4136 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
4137 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
4139 /* It is implementation-defined whether the cases of (0,inf,qnan)
4140 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4141 * they return if they do), so we have to hand this information
4142 * off to the target-specific pick-a-NaN routine.
4144 if (((aExp
== 0x7ff) && aSig
) ||
4145 ((bExp
== 0x7ff) && bSig
) ||
4146 ((cExp
== 0x7ff) && cSig
)) {
4147 return propagateFloat64MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
4151 float_raise(float_flag_invalid STATUS_VAR
);
4152 return float64_default_nan
;
4155 if (flags
& float_muladd_negate_c
) {
4159 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
4161 /* Work out the sign and type of the product */
4162 pSign
= aSign
^ bSign
;
4163 if (flags
& float_muladd_negate_product
) {
4166 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
4167 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
4169 if (cExp
== 0x7ff) {
4170 if (pInf
&& (pSign
^ cSign
)) {
4171 /* addition of opposite-signed infinities => InvalidOperation */
4172 float_raise(float_flag_invalid STATUS_VAR
);
4173 return float64_default_nan
;
4175 /* Otherwise generate an infinity of the same sign */
4176 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
4180 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
4186 /* Adding two exact zeroes */
4187 if (pSign
== cSign
) {
4189 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
4194 return packFloat64(zSign
^ signflip
, 0, 0);
4196 /* Exact zero plus a denorm */
4197 if (STATUS(flush_to_zero
)) {
4198 float_raise(float_flag_output_denormal STATUS_VAR
);
4199 return packFloat64(cSign
^ signflip
, 0, 0);
4202 /* Zero plus something non-zero : just return the something */
4203 if (flags
& float_muladd_halve_result
) {
4205 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4207 /* Subtract one to halve, and one again because roundAndPackFloat64
4208 * wants one less than the true exponent.
4211 cSig
= (cSig
| 0x0010000000000000ULL
) << 10;
4212 return roundAndPackFloat64(cSign
^ signflip
, cExp
, cSig STATUS_VAR
);
4214 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4218 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4221 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4224 /* Calculate the actual result a * b + c */
4226 /* Multiply first; this is easy. */
4227 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4228 * because we want the true exponent, not the "one-less-than"
4229 * flavour that roundAndPackFloat64() takes.
4231 pExp
= aExp
+ bExp
- 0x3fe;
4232 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4233 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4234 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4235 if ((int64_t)(pSig0
<< 1) >= 0) {
4236 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4240 zSign
= pSign
^ signflip
;
4242 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4243 * bit in position 126.
4247 /* Throw out the special case of c being an exact zero now */
4248 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4249 if (flags
& float_muladd_halve_result
) {
4252 return roundAndPackFloat64(zSign
, pExp
- 1,
4255 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4258 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4259 * significand of the addend, with the explicit bit in position 126.
4261 cSig0
= cSig
<< (126 - 64 - 52);
4263 cSig0
|= LIT64(0x4000000000000000);
4264 expDiff
= pExp
- cExp
;
4266 if (pSign
== cSign
) {
4269 /* scale c to match p */
4270 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4272 } else if (expDiff
< 0) {
4273 /* scale p to match c */
4274 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4277 /* no scaling needed */
4280 /* Add significands and make sure explicit bit ends up in posn 126 */
4281 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4282 if ((int64_t)zSig0
< 0) {
4283 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4287 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4288 if (flags
& float_muladd_halve_result
) {
4291 return roundAndPackFloat64(zSign
, zExp
, zSig1 STATUS_VAR
);
4295 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4296 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4298 } else if (expDiff
< 0) {
4299 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4300 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4305 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4306 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4307 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4308 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4313 if (STATUS(float_rounding_mode
) == float_round_down
) {
4316 return packFloat64(zSign
, 0, 0);
4320 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4321 * starting with the significand in a pair of uint64_t.
4324 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4325 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4331 shiftcount
= countLeadingZeros64(zSig1
);
4332 if (shiftcount
== 0) {
4333 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4337 zSig0
= zSig1
<< shiftcount
;
4338 zExp
-= (shiftcount
+ 64);
4341 if (flags
& float_muladd_halve_result
) {
4344 return roundAndPackFloat64(zSign
, zExp
, zSig0 STATUS_VAR
);
4348 /*----------------------------------------------------------------------------
4349 | Returns the square root of the double-precision floating-point value `a'.
4350 | The operation is performed according to the IEC/IEEE Standard for Binary
4351 | Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4354 float64
float64_sqrt( float64 a STATUS_PARAM
)
4357 int_fast16_t aExp
, zExp
;
4358 uint64_t aSig
, zSig
, doubleZSig
;
4359 uint64_t rem0
, rem1
, term0
, term1
;
4360 a
= float64_squash_input_denormal(a STATUS_VAR
);
4362 aSig
= extractFloat64Frac( a
);
4363 aExp
= extractFloat64Exp( a
);
4364 aSign
= extractFloat64Sign( a
);
4365 if ( aExp
== 0x7FF ) {
4366 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
4367 if ( ! aSign
) return a
;
4368 float_raise( float_flag_invalid STATUS_VAR
);
4369 return float64_default_nan
;
4372 if ( ( aExp
| aSig
) == 0 ) return a
;
4373 float_raise( float_flag_invalid STATUS_VAR
);
4374 return float64_default_nan
;
4377 if ( aSig
== 0 ) return float64_zero
;
4378 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4380 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4381 aSig
|= LIT64( 0x0010000000000000 );
4382 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4383 aSig
<<= 9 - ( aExp
& 1 );
4384 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4385 if ( ( zSig
& 0x1FF ) <= 5 ) {
4386 doubleZSig
= zSig
<<1;
4387 mul64To128( zSig
, zSig
, &term0
, &term1
);
4388 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4389 while ( (int64_t) rem0
< 0 ) {
4392 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4394 zSig
|= ( ( rem0
| rem1
) != 0 );
4396 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
4400 /*----------------------------------------------------------------------------
4401 | Returns the binary log of the double-precision floating-point value `a'.
4402 | The operation is performed according to the IEC/IEEE Standard for Binary
4403 | Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 float64
float64_log2( float64 a STATUS_PARAM
)
4409 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4410 a
= float64_squash_input_denormal(a STATUS_VAR
);
4412 aSig
= extractFloat64Frac( a
);
4413 aExp
= extractFloat64Exp( a
);
4414 aSign
= extractFloat64Sign( a
);
4417 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4418 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4421 float_raise( float_flag_invalid STATUS_VAR
);
4422 return float64_default_nan
;
4424 if ( aExp
== 0x7FF ) {
4425 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
4430 aSig
|= LIT64( 0x0010000000000000 );
4432 zSig
= (uint64_t)aExp
<< 52;
4433 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4434 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4435 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4436 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4444 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point value `a' is equal to the
4449 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4450 | if either operand is a NaN. Otherwise, the comparison is performed
4451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4454 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
4457 a
= float64_squash_input_denormal(a STATUS_VAR
);
4458 b
= float64_squash_input_denormal(b STATUS_VAR
);
4460 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4461 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4463 float_raise( float_flag_invalid STATUS_VAR
);
4466 av
= float64_val(a
);
4467 bv
= float64_val(b
);
4468 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4472 /*----------------------------------------------------------------------------
4473 | Returns 1 if the double-precision floating-point value `a' is less than or
4474 | equal to the corresponding value `b', and 0 otherwise. The invalid
4475 | exception is raised if either operand is a NaN. The comparison is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4479 int float64_le( float64 a
, float64 b STATUS_PARAM
)
4483 a
= float64_squash_input_denormal(a STATUS_VAR
);
4484 b
= float64_squash_input_denormal(b STATUS_VAR
);
4486 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4487 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4489 float_raise( float_flag_invalid STATUS_VAR
);
4492 aSign
= extractFloat64Sign( a
);
4493 bSign
= extractFloat64Sign( b
);
4494 av
= float64_val(a
);
4495 bv
= float64_val(b
);
4496 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4497 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4501 /*----------------------------------------------------------------------------
4502 | Returns 1 if the double-precision floating-point value `a' is less than
4503 | the corresponding value `b', and 0 otherwise. The invalid exception is
4504 | raised if either operand is a NaN. The comparison is performed according
4505 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4508 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
4513 a
= float64_squash_input_denormal(a STATUS_VAR
);
4514 b
= float64_squash_input_denormal(b STATUS_VAR
);
4515 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4516 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4518 float_raise( float_flag_invalid STATUS_VAR
);
4521 aSign
= extractFloat64Sign( a
);
4522 bSign
= extractFloat64Sign( b
);
4523 av
= float64_val(a
);
4524 bv
= float64_val(b
);
4525 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4526 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4532 | be compared, and 0 otherwise. The invalid exception is raised if either
4533 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4534 | Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
4539 a
= float64_squash_input_denormal(a STATUS_VAR
);
4540 b
= float64_squash_input_denormal(b STATUS_VAR
);
4542 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4543 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4545 float_raise( float_flag_invalid STATUS_VAR
);
4551 /*----------------------------------------------------------------------------
4552 | Returns 1 if the double-precision floating-point value `a' is equal to the
4553 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4554 | exception.The comparison is performed according to the IEC/IEEE Standard
4555 | for Binary Floating-Point Arithmetic.
4556 *----------------------------------------------------------------------------*/
4558 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
4561 a
= float64_squash_input_denormal(a STATUS_VAR
);
4562 b
= float64_squash_input_denormal(b STATUS_VAR
);
4564 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4565 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4567 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4568 float_raise( float_flag_invalid STATUS_VAR
);
4572 av
= float64_val(a
);
4573 bv
= float64_val(b
);
4574 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4578 /*----------------------------------------------------------------------------
4579 | Returns 1 if the double-precision floating-point value `a' is less than or
4580 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4581 | cause an exception. Otherwise, the comparison is performed according to the
4582 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4583 *----------------------------------------------------------------------------*/
4585 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
4589 a
= float64_squash_input_denormal(a STATUS_VAR
);
4590 b
= float64_squash_input_denormal(b STATUS_VAR
);
4592 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4593 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4595 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4596 float_raise( float_flag_invalid STATUS_VAR
);
4600 aSign
= extractFloat64Sign( a
);
4601 bSign
= extractFloat64Sign( b
);
4602 av
= float64_val(a
);
4603 bv
= float64_val(b
);
4604 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4605 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4609 /*----------------------------------------------------------------------------
4610 | Returns 1 if the double-precision floating-point value `a' is less than
4611 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4612 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4613 | Standard for Binary Floating-Point Arithmetic.
4614 *----------------------------------------------------------------------------*/
4616 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
4620 a
= float64_squash_input_denormal(a STATUS_VAR
);
4621 b
= float64_squash_input_denormal(b STATUS_VAR
);
4623 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4624 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4626 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4627 float_raise( float_flag_invalid STATUS_VAR
);
4631 aSign
= extractFloat64Sign( a
);
4632 bSign
= extractFloat64Sign( b
);
4633 av
= float64_val(a
);
4634 bv
= float64_val(b
);
4635 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4636 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4640 /*----------------------------------------------------------------------------
4641 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4642 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4643 | comparison is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic.
4645 *----------------------------------------------------------------------------*/
4647 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
4649 a
= float64_squash_input_denormal(a STATUS_VAR
);
4650 b
= float64_squash_input_denormal(b STATUS_VAR
);
4652 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4653 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4655 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4656 float_raise( float_flag_invalid STATUS_VAR
);
4663 /*----------------------------------------------------------------------------
4664 | Returns the result of converting the extended double-precision floating-
4665 | point value `a' to the 32-bit two's complement integer format. The
4666 | conversion is performed according to the IEC/IEEE Standard for Binary
4667 | Floating-Point Arithmetic---which means in particular that the conversion
4668 | is rounded according to the current rounding mode. If `a' is a NaN, the
4669 | largest positive integer is returned. Otherwise, if the conversion
4670 | overflows, the largest integer with the same sign as `a' is returned.
4671 *----------------------------------------------------------------------------*/
4673 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
4676 int32 aExp
, shiftCount
;
4679 aSig
= extractFloatx80Frac( a
);
4680 aExp
= extractFloatx80Exp( a
);
4681 aSign
= extractFloatx80Sign( a
);
4682 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4683 shiftCount
= 0x4037 - aExp
;
4684 if ( shiftCount
<= 0 ) shiftCount
= 1;
4685 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4686 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the 32-bit two's complement integer format. The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic, except that the conversion is always rounded
4695 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4696 | Otherwise, if the conversion overflows, the largest integer with the same
4697 | sign as `a' is returned.
4698 *----------------------------------------------------------------------------*/
4700 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
4703 int32 aExp
, shiftCount
;
4704 uint64_t aSig
, savedASig
;
4707 aSig
= extractFloatx80Frac( a
);
4708 aExp
= extractFloatx80Exp( a
);
4709 aSign
= extractFloatx80Sign( a
);
4710 if ( 0x401E < aExp
) {
4711 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4714 else if ( aExp
< 0x3FFF ) {
4715 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4718 shiftCount
= 0x403E - aExp
;
4720 aSig
>>= shiftCount
;
4722 if ( aSign
) z
= - z
;
4723 if ( ( z
< 0 ) ^ aSign
) {
4725 float_raise( float_flag_invalid STATUS_VAR
);
4726 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4728 if ( ( aSig
<<shiftCount
) != savedASig
) {
4729 STATUS(float_exception_flags
) |= float_flag_inexact
;
4735 /*----------------------------------------------------------------------------
4736 | Returns the result of converting the extended double-precision floating-
4737 | point value `a' to the 64-bit two's complement integer format. The
4738 | conversion is performed according to the IEC/IEEE Standard for Binary
4739 | Floating-Point Arithmetic---which means in particular that the conversion
4740 | is rounded according to the current rounding mode. If `a' is a NaN,
4741 | the largest positive integer is returned. Otherwise, if the conversion
4742 | overflows, the largest integer with the same sign as `a' is returned.
4743 *----------------------------------------------------------------------------*/
4745 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
4748 int32 aExp
, shiftCount
;
4749 uint64_t aSig
, aSigExtra
;
4751 aSig
= extractFloatx80Frac( a
);
4752 aExp
= extractFloatx80Exp( a
);
4753 aSign
= extractFloatx80Sign( a
);
4754 shiftCount
= 0x403E - aExp
;
4755 if ( shiftCount
<= 0 ) {
4757 float_raise( float_flag_invalid STATUS_VAR
);
4759 || ( ( aExp
== 0x7FFF )
4760 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4762 return LIT64( 0x7FFFFFFFFFFFFFFF );
4764 return (int64_t) LIT64( 0x8000000000000000 );
4769 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4771 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
4775 /*----------------------------------------------------------------------------
4776 | Returns the result of converting the extended double-precision floating-
4777 | point value `a' to the 64-bit two's complement integer format. The
4778 | conversion is performed according to the IEC/IEEE Standard for Binary
4779 | Floating-Point Arithmetic, except that the conversion is always rounded
4780 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4781 | Otherwise, if the conversion overflows, the largest integer with the same
4782 | sign as `a' is returned.
4783 *----------------------------------------------------------------------------*/
4785 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
4788 int32 aExp
, shiftCount
;
4792 aSig
= extractFloatx80Frac( a
);
4793 aExp
= extractFloatx80Exp( a
);
4794 aSign
= extractFloatx80Sign( a
);
4795 shiftCount
= aExp
- 0x403E;
4796 if ( 0 <= shiftCount
) {
4797 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4798 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4799 float_raise( float_flag_invalid STATUS_VAR
);
4800 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4801 return LIT64( 0x7FFFFFFFFFFFFFFF );
4804 return (int64_t) LIT64( 0x8000000000000000 );
4806 else if ( aExp
< 0x3FFF ) {
4807 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4810 z
= aSig
>>( - shiftCount
);
4811 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4812 STATUS(float_exception_flags
) |= float_flag_inexact
;
4814 if ( aSign
) z
= - z
;
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the extended double-precision floating-
4821 | point value `a' to the single-precision floating-point format. The
4822 | conversion is performed according to the IEC/IEEE Standard for Binary
4823 | Floating-Point Arithmetic.
4824 *----------------------------------------------------------------------------*/
4826 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
4832 aSig
= extractFloatx80Frac( a
);
4833 aExp
= extractFloatx80Exp( a
);
4834 aSign
= extractFloatx80Sign( a
);
4835 if ( aExp
== 0x7FFF ) {
4836 if ( (uint64_t) ( aSig
<<1 ) ) {
4837 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4839 return packFloat32( aSign
, 0xFF, 0 );
4841 shift64RightJamming( aSig
, 33, &aSig
);
4842 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4843 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of converting the extended double-precision floating-
4849 | point value `a' to the double-precision floating-point format. The
4850 | conversion is performed according to the IEC/IEEE Standard for Binary
4851 | Floating-Point Arithmetic.
4852 *----------------------------------------------------------------------------*/
4854 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
4858 uint64_t aSig
, zSig
;
4860 aSig
= extractFloatx80Frac( a
);
4861 aExp
= extractFloatx80Exp( a
);
4862 aSign
= extractFloatx80Sign( a
);
4863 if ( aExp
== 0x7FFF ) {
4864 if ( (uint64_t) ( aSig
<<1 ) ) {
4865 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4867 return packFloat64( aSign
, 0x7FF, 0 );
4869 shift64RightJamming( aSig
, 1, &zSig
);
4870 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4871 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
4875 /*----------------------------------------------------------------------------
4876 | Returns the result of converting the extended double-precision floating-
4877 | point value `a' to the quadruple-precision floating-point format. The
4878 | conversion is performed according to the IEC/IEEE Standard for Binary
4879 | Floating-Point Arithmetic.
4880 *----------------------------------------------------------------------------*/
4882 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4886 uint64_t aSig
, zSig0
, zSig1
;
4888 aSig
= extractFloatx80Frac( a
);
4889 aExp
= extractFloatx80Exp( a
);
4890 aSign
= extractFloatx80Sign( a
);
4891 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4892 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4894 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4895 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4899 /*----------------------------------------------------------------------------
4900 | Rounds the extended double-precision floating-point value `a' to an integer,
4901 | and returns the result as an extended quadruple-precision floating-point
4902 | value. The operation is performed according to the IEC/IEEE Standard for
4903 | Binary Floating-Point Arithmetic.
4904 *----------------------------------------------------------------------------*/
4906 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4910 uint64_t lastBitMask
, roundBitsMask
;
4913 aExp
= extractFloatx80Exp( a
);
4914 if ( 0x403E <= aExp
) {
4915 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4916 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4920 if ( aExp
< 0x3FFF ) {
4922 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4925 STATUS(float_exception_flags
) |= float_flag_inexact
;
4926 aSign
= extractFloatx80Sign( a
);
4927 switch ( STATUS(float_rounding_mode
) ) {
4928 case float_round_nearest_even
:
4929 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4932 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4935 case float_round_ties_away
:
4936 if (aExp
== 0x3FFE) {
4937 return packFloatx80(aSign
, 0x3FFF, LIT64(0x8000000000000000));
4940 case float_round_down
:
4943 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4944 : packFloatx80( 0, 0, 0 );
4945 case float_round_up
:
4947 aSign
? packFloatx80( 1, 0, 0 )
4948 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4950 return packFloatx80( aSign
, 0, 0 );
4953 lastBitMask
<<= 0x403E - aExp
;
4954 roundBitsMask
= lastBitMask
- 1;
4956 switch (STATUS(float_rounding_mode
)) {
4957 case float_round_nearest_even
:
4958 z
.low
+= lastBitMask
>>1;
4959 if ((z
.low
& roundBitsMask
) == 0) {
4960 z
.low
&= ~lastBitMask
;
4963 case float_round_ties_away
:
4964 z
.low
+= lastBitMask
>> 1;
4966 case float_round_to_zero
:
4968 case float_round_up
:
4969 if (!extractFloatx80Sign(z
)) {
4970 z
.low
+= roundBitsMask
;
4973 case float_round_down
:
4974 if (extractFloatx80Sign(z
)) {
4975 z
.low
+= roundBitsMask
;
4981 z
.low
&= ~ roundBitsMask
;
4984 z
.low
= LIT64( 0x8000000000000000 );
4986 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of adding the absolute values of the extended double-
4993 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4994 | negated before being returned. `zSign' is ignored if the result is a NaN.
4995 | The addition is performed according to the IEC/IEEE Standard for Binary
4996 | Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4999 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
5001 int32 aExp
, bExp
, zExp
;
5002 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5005 aSig
= extractFloatx80Frac( a
);
5006 aExp
= extractFloatx80Exp( a
);
5007 bSig
= extractFloatx80Frac( b
);
5008 bExp
= extractFloatx80Exp( b
);
5009 expDiff
= aExp
- bExp
;
5010 if ( 0 < expDiff
) {
5011 if ( aExp
== 0x7FFF ) {
5012 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5015 if ( bExp
== 0 ) --expDiff
;
5016 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5019 else if ( expDiff
< 0 ) {
5020 if ( bExp
== 0x7FFF ) {
5021 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5022 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5024 if ( aExp
== 0 ) ++expDiff
;
5025 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5029 if ( aExp
== 0x7FFF ) {
5030 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5031 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5036 zSig0
= aSig
+ bSig
;
5038 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
5044 zSig0
= aSig
+ bSig
;
5045 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
5047 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5048 zSig0
|= LIT64( 0x8000000000000000 );
5052 roundAndPackFloatx80(
5053 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5057 /*----------------------------------------------------------------------------
5058 | Returns the result of subtracting the absolute values of the extended
5059 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5060 | difference is negated before being returned. `zSign' is ignored if the
5061 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5062 | Standard for Binary Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5065 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
5067 int32 aExp
, bExp
, zExp
;
5068 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5072 aSig
= extractFloatx80Frac( a
);
5073 aExp
= extractFloatx80Exp( a
);
5074 bSig
= extractFloatx80Frac( b
);
5075 bExp
= extractFloatx80Exp( b
);
5076 expDiff
= aExp
- bExp
;
5077 if ( 0 < expDiff
) goto aExpBigger
;
5078 if ( expDiff
< 0 ) goto bExpBigger
;
5079 if ( aExp
== 0x7FFF ) {
5080 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
5081 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5083 float_raise( float_flag_invalid STATUS_VAR
);
5084 z
.low
= floatx80_default_nan_low
;
5085 z
.high
= floatx80_default_nan_high
;
5093 if ( bSig
< aSig
) goto aBigger
;
5094 if ( aSig
< bSig
) goto bBigger
;
5095 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
5097 if ( bExp
== 0x7FFF ) {
5098 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5099 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5101 if ( aExp
== 0 ) ++expDiff
;
5102 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
5104 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
5107 goto normalizeRoundAndPack
;
5109 if ( aExp
== 0x7FFF ) {
5110 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5113 if ( bExp
== 0 ) --expDiff
;
5114 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
5116 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
5118 normalizeRoundAndPack
:
5120 normalizeRoundAndPackFloatx80(
5121 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5131 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
5135 aSign
= extractFloatx80Sign( a
);
5136 bSign
= extractFloatx80Sign( b
);
5137 if ( aSign
== bSign
) {
5138 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5141 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5146 /*----------------------------------------------------------------------------
5147 | Returns the result of subtracting the extended double-precision floating-
5148 | point values `a' and `b'. The operation is performed according to the
5149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5152 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
5156 aSign
= extractFloatx80Sign( a
);
5157 bSign
= extractFloatx80Sign( b
);
5158 if ( aSign
== bSign
) {
5159 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5162 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of multiplying the extended double-precision floating-
5169 | point values `a' and `b'. The operation is performed according to the
5170 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
5175 flag aSign
, bSign
, zSign
;
5176 int32 aExp
, bExp
, zExp
;
5177 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5180 aSig
= extractFloatx80Frac( a
);
5181 aExp
= extractFloatx80Exp( a
);
5182 aSign
= extractFloatx80Sign( a
);
5183 bSig
= extractFloatx80Frac( b
);
5184 bExp
= extractFloatx80Exp( b
);
5185 bSign
= extractFloatx80Sign( b
);
5186 zSign
= aSign
^ bSign
;
5187 if ( aExp
== 0x7FFF ) {
5188 if ( (uint64_t) ( aSig
<<1 )
5189 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5190 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5192 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
5193 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5195 if ( bExp
== 0x7FFF ) {
5196 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5197 if ( ( aExp
| aSig
) == 0 ) {
5199 float_raise( float_flag_invalid STATUS_VAR
);
5200 z
.low
= floatx80_default_nan_low
;
5201 z
.high
= floatx80_default_nan_high
;
5204 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5207 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5208 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5211 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5212 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5214 zExp
= aExp
+ bExp
- 0x3FFE;
5215 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
5216 if ( 0 < (int64_t) zSig0
) {
5217 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
5221 roundAndPackFloatx80(
5222 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5226 /*----------------------------------------------------------------------------
5227 | Returns the result of dividing the extended double-precision floating-point
5228 | value `a' by the corresponding value `b'. The operation is performed
5229 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5230 *----------------------------------------------------------------------------*/
5232 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
5234 flag aSign
, bSign
, zSign
;
5235 int32 aExp
, bExp
, zExp
;
5236 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5237 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5240 aSig
= extractFloatx80Frac( a
);
5241 aExp
= extractFloatx80Exp( a
);
5242 aSign
= extractFloatx80Sign( a
);
5243 bSig
= extractFloatx80Frac( b
);
5244 bExp
= extractFloatx80Exp( b
);
5245 bSign
= extractFloatx80Sign( b
);
5246 zSign
= aSign
^ bSign
;
5247 if ( aExp
== 0x7FFF ) {
5248 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5249 if ( bExp
== 0x7FFF ) {
5250 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5253 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5255 if ( bExp
== 0x7FFF ) {
5256 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5257 return packFloatx80( zSign
, 0, 0 );
5261 if ( ( aExp
| aSig
) == 0 ) {
5263 float_raise( float_flag_invalid STATUS_VAR
);
5264 z
.low
= floatx80_default_nan_low
;
5265 z
.high
= floatx80_default_nan_high
;
5268 float_raise( float_flag_divbyzero STATUS_VAR
);
5269 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5271 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5274 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5275 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5277 zExp
= aExp
- bExp
+ 0x3FFE;
5279 if ( bSig
<= aSig
) {
5280 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5283 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5284 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5285 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5286 while ( (int64_t) rem0
< 0 ) {
5288 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5290 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5291 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5292 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5293 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5294 while ( (int64_t) rem1
< 0 ) {
5296 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5298 zSig1
|= ( ( rem1
| rem2
) != 0 );
5301 roundAndPackFloatx80(
5302 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5306 /*----------------------------------------------------------------------------
5307 | Returns the remainder of the extended double-precision floating-point value
5308 | `a' with respect to the corresponding value `b'. The operation is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5312 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
5315 int32 aExp
, bExp
, expDiff
;
5316 uint64_t aSig0
, aSig1
, bSig
;
5317 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5320 aSig0
= extractFloatx80Frac( a
);
5321 aExp
= extractFloatx80Exp( a
);
5322 aSign
= extractFloatx80Sign( a
);
5323 bSig
= extractFloatx80Frac( b
);
5324 bExp
= extractFloatx80Exp( b
);
5325 if ( aExp
== 0x7FFF ) {
5326 if ( (uint64_t) ( aSig0
<<1 )
5327 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5328 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5332 if ( bExp
== 0x7FFF ) {
5333 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5339 float_raise( float_flag_invalid STATUS_VAR
);
5340 z
.low
= floatx80_default_nan_low
;
5341 z
.high
= floatx80_default_nan_high
;
5344 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5347 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5348 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5350 bSig
|= LIT64( 0x8000000000000000 );
5352 expDiff
= aExp
- bExp
;
5354 if ( expDiff
< 0 ) {
5355 if ( expDiff
< -1 ) return a
;
5356 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5359 q
= ( bSig
<= aSig0
);
5360 if ( q
) aSig0
-= bSig
;
5362 while ( 0 < expDiff
) {
5363 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5364 q
= ( 2 < q
) ? q
- 2 : 0;
5365 mul64To128( bSig
, q
, &term0
, &term1
);
5366 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5367 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5371 if ( 0 < expDiff
) {
5372 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5373 q
= ( 2 < q
) ? q
- 2 : 0;
5375 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5376 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5377 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5378 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5380 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5387 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5388 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5389 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5392 aSig0
= alternateASig0
;
5393 aSig1
= alternateASig1
;
5397 normalizeRoundAndPackFloatx80(
5398 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
5402 /*----------------------------------------------------------------------------
5403 | Returns the square root of the extended double-precision floating-point
5404 | value `a'. The operation is performed according to the IEC/IEEE Standard
5405 | for Binary Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5408 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
5412 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5413 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5416 aSig0
= extractFloatx80Frac( a
);
5417 aExp
= extractFloatx80Exp( a
);
5418 aSign
= extractFloatx80Sign( a
);
5419 if ( aExp
== 0x7FFF ) {
5420 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
5421 if ( ! aSign
) return a
;
5425 if ( ( aExp
| aSig0
) == 0 ) return a
;
5427 float_raise( float_flag_invalid STATUS_VAR
);
5428 z
.low
= floatx80_default_nan_low
;
5429 z
.high
= floatx80_default_nan_high
;
5433 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5434 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5436 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5437 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5438 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5439 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5440 doubleZSig0
= zSig0
<<1;
5441 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5442 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5443 while ( (int64_t) rem0
< 0 ) {
5446 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5448 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5449 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5450 if ( zSig1
== 0 ) zSig1
= 1;
5451 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5452 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5453 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5454 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5455 while ( (int64_t) rem1
< 0 ) {
5457 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5459 term2
|= doubleZSig0
;
5460 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5462 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5464 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5465 zSig0
|= doubleZSig0
;
5467 roundAndPackFloatx80(
5468 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
5472 /*----------------------------------------------------------------------------
5473 | Returns 1 if the extended double-precision floating-point value `a' is equal
5474 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5475 | raised if either operand is a NaN. Otherwise, the comparison is performed
5476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5477 *----------------------------------------------------------------------------*/
5479 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
5482 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5484 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5485 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5487 float_raise( float_flag_invalid STATUS_VAR
);
5492 && ( ( a
.high
== b
.high
)
5494 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5499 /*----------------------------------------------------------------------------
5500 | Returns 1 if the extended double-precision floating-point value `a' is
5501 | less than or equal to the corresponding value `b', and 0 otherwise. The
5502 | invalid exception is raised if either operand is a NaN. The comparison is
5503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5505 *----------------------------------------------------------------------------*/
5507 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
5511 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5512 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5513 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5514 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5516 float_raise( float_flag_invalid STATUS_VAR
);
5519 aSign
= extractFloatx80Sign( a
);
5520 bSign
= extractFloatx80Sign( b
);
5521 if ( aSign
!= bSign
) {
5524 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5528 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5529 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5533 /*----------------------------------------------------------------------------
5534 | Returns 1 if the extended double-precision floating-point value `a' is
5535 | less than the corresponding value `b', and 0 otherwise. The invalid
5536 | exception is raised if either operand is a NaN. The comparison is performed
5537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538 *----------------------------------------------------------------------------*/
5540 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
5544 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5545 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5546 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5547 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5549 float_raise( float_flag_invalid STATUS_VAR
);
5552 aSign
= extractFloatx80Sign( a
);
5553 bSign
= extractFloatx80Sign( b
);
5554 if ( aSign
!= bSign
) {
5557 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5561 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5562 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5566 /*----------------------------------------------------------------------------
5567 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5568 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5569 | either operand is a NaN. The comparison is performed according to the
5570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5571 *----------------------------------------------------------------------------*/
5572 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
5574 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5575 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5576 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5577 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5579 float_raise( float_flag_invalid STATUS_VAR
);
5585 /*----------------------------------------------------------------------------
5586 | Returns 1 if the extended double-precision floating-point value `a' is
5587 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5588 | cause an exception. The comparison is performed according to the IEC/IEEE
5589 | Standard for Binary Floating-Point Arithmetic.
5590 *----------------------------------------------------------------------------*/
5592 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5595 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5596 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5597 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5598 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5600 if ( floatx80_is_signaling_nan( a
)
5601 || floatx80_is_signaling_nan( b
) ) {
5602 float_raise( float_flag_invalid STATUS_VAR
);
5608 && ( ( a
.high
== b
.high
)
5610 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5615 /*----------------------------------------------------------------------------
5616 | Returns 1 if the extended double-precision floating-point value `a' is less
5617 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5618 | do not cause an exception. Otherwise, the comparison is performed according
5619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5622 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5626 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5627 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5628 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5629 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5631 if ( floatx80_is_signaling_nan( a
)
5632 || floatx80_is_signaling_nan( b
) ) {
5633 float_raise( float_flag_invalid STATUS_VAR
);
5637 aSign
= extractFloatx80Sign( a
);
5638 bSign
= extractFloatx80Sign( b
);
5639 if ( aSign
!= bSign
) {
5642 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5646 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5647 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5651 /*----------------------------------------------------------------------------
5652 | Returns 1 if the extended double-precision floating-point value `a' is less
5653 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5654 | an exception. Otherwise, the comparison is performed according to the
5655 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5656 *----------------------------------------------------------------------------*/
5658 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5662 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5663 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5664 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5665 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5667 if ( floatx80_is_signaling_nan( a
)
5668 || floatx80_is_signaling_nan( b
) ) {
5669 float_raise( float_flag_invalid STATUS_VAR
);
5673 aSign
= extractFloatx80Sign( a
);
5674 bSign
= extractFloatx80Sign( b
);
5675 if ( aSign
!= bSign
) {
5678 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5682 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5683 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5687 /*----------------------------------------------------------------------------
5688 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5689 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5690 | The comparison is performed according to the IEC/IEEE Standard for Binary
5691 | Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5693 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5695 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5696 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5697 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5698 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5700 if ( floatx80_is_signaling_nan( a
)
5701 || floatx80_is_signaling_nan( b
) ) {
5702 float_raise( float_flag_invalid STATUS_VAR
);
5709 /*----------------------------------------------------------------------------
5710 | Returns the result of converting the quadruple-precision floating-point
5711 | value `a' to the 32-bit two's complement integer format. The conversion
5712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5713 | Arithmetic---which means in particular that the conversion is rounded
5714 | according to the current rounding mode. If `a' is a NaN, the largest
5715 | positive integer is returned. Otherwise, if the conversion overflows, the
5716 | largest integer with the same sign as `a' is returned.
5717 *----------------------------------------------------------------------------*/
5719 int32
float128_to_int32( float128 a STATUS_PARAM
)
5722 int32 aExp
, shiftCount
;
5723 uint64_t aSig0
, aSig1
;
5725 aSig1
= extractFloat128Frac1( a
);
5726 aSig0
= extractFloat128Frac0( a
);
5727 aExp
= extractFloat128Exp( a
);
5728 aSign
= extractFloat128Sign( a
);
5729 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5730 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5731 aSig0
|= ( aSig1
!= 0 );
5732 shiftCount
= 0x4028 - aExp
;
5733 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5734 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 32-bit two's complement integer format. The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero. If
5743 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5744 | conversion overflows, the largest integer with the same sign as `a' is
5746 *----------------------------------------------------------------------------*/
5748 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
5751 int32 aExp
, shiftCount
;
5752 uint64_t aSig0
, aSig1
, savedASig
;
5755 aSig1
= extractFloat128Frac1( a
);
5756 aSig0
= extractFloat128Frac0( a
);
5757 aExp
= extractFloat128Exp( a
);
5758 aSign
= extractFloat128Sign( a
);
5759 aSig0
|= ( aSig1
!= 0 );
5760 if ( 0x401E < aExp
) {
5761 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5764 else if ( aExp
< 0x3FFF ) {
5765 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5768 aSig0
|= LIT64( 0x0001000000000000 );
5769 shiftCount
= 0x402F - aExp
;
5771 aSig0
>>= shiftCount
;
5773 if ( aSign
) z
= - z
;
5774 if ( ( z
< 0 ) ^ aSign
) {
5776 float_raise( float_flag_invalid STATUS_VAR
);
5777 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5779 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5780 STATUS(float_exception_flags
) |= float_flag_inexact
;
5786 /*----------------------------------------------------------------------------
5787 | Returns the result of converting the quadruple-precision floating-point
5788 | value `a' to the 64-bit two's complement integer format. The conversion
5789 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5790 | Arithmetic---which means in particular that the conversion is rounded
5791 | according to the current rounding mode. If `a' is a NaN, the largest
5792 | positive integer is returned. Otherwise, if the conversion overflows, the
5793 | largest integer with the same sign as `a' is returned.
5794 *----------------------------------------------------------------------------*/
5796 int64
float128_to_int64( float128 a STATUS_PARAM
)
5799 int32 aExp
, shiftCount
;
5800 uint64_t aSig0
, aSig1
;
5802 aSig1
= extractFloat128Frac1( a
);
5803 aSig0
= extractFloat128Frac0( a
);
5804 aExp
= extractFloat128Exp( a
);
5805 aSign
= extractFloat128Sign( a
);
5806 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5807 shiftCount
= 0x402F - aExp
;
5808 if ( shiftCount
<= 0 ) {
5809 if ( 0x403E < aExp
) {
5810 float_raise( float_flag_invalid STATUS_VAR
);
5812 || ( ( aExp
== 0x7FFF )
5813 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5816 return LIT64( 0x7FFFFFFFFFFFFFFF );
5818 return (int64_t) LIT64( 0x8000000000000000 );
5820 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5823 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5825 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of converting the quadruple-precision floating-point
5831 | value `a' to the 64-bit two's complement integer format. The conversion
5832 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833 | Arithmetic, except that the conversion is always rounded toward zero.
5834 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5835 | the conversion overflows, the largest integer with the same sign as `a' is
5837 *----------------------------------------------------------------------------*/
5839 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
5842 int32 aExp
, shiftCount
;
5843 uint64_t aSig0
, aSig1
;
5846 aSig1
= extractFloat128Frac1( a
);
5847 aSig0
= extractFloat128Frac0( a
);
5848 aExp
= extractFloat128Exp( a
);
5849 aSign
= extractFloat128Sign( a
);
5850 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5851 shiftCount
= aExp
- 0x402F;
5852 if ( 0 < shiftCount
) {
5853 if ( 0x403E <= aExp
) {
5854 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5855 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5856 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5857 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5860 float_raise( float_flag_invalid STATUS_VAR
);
5861 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5862 return LIT64( 0x7FFFFFFFFFFFFFFF );
5865 return (int64_t) LIT64( 0x8000000000000000 );
5867 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5868 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5869 STATUS(float_exception_flags
) |= float_flag_inexact
;
5873 if ( aExp
< 0x3FFF ) {
5874 if ( aExp
| aSig0
| aSig1
) {
5875 STATUS(float_exception_flags
) |= float_flag_inexact
;
5879 z
= aSig0
>>( - shiftCount
);
5881 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5882 STATUS(float_exception_flags
) |= float_flag_inexact
;
5885 if ( aSign
) z
= - z
;
5890 /*----------------------------------------------------------------------------
5891 | Returns the result of converting the quadruple-precision floating-point
5892 | value `a' to the single-precision floating-point format. The conversion
5893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5895 *----------------------------------------------------------------------------*/
5897 float32
float128_to_float32( float128 a STATUS_PARAM
)
5901 uint64_t aSig0
, aSig1
;
5904 aSig1
= extractFloat128Frac1( a
);
5905 aSig0
= extractFloat128Frac0( a
);
5906 aExp
= extractFloat128Exp( a
);
5907 aSign
= extractFloat128Sign( a
);
5908 if ( aExp
== 0x7FFF ) {
5909 if ( aSig0
| aSig1
) {
5910 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5912 return packFloat32( aSign
, 0xFF, 0 );
5914 aSig0
|= ( aSig1
!= 0 );
5915 shift64RightJamming( aSig0
, 18, &aSig0
);
5917 if ( aExp
|| zSig
) {
5921 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the double-precision floating-point format. The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 *----------------------------------------------------------------------------*/
5932 float64
float128_to_float64( float128 a STATUS_PARAM
)
5936 uint64_t aSig0
, aSig1
;
5938 aSig1
= extractFloat128Frac1( a
);
5939 aSig0
= extractFloat128Frac0( a
);
5940 aExp
= extractFloat128Exp( a
);
5941 aSign
= extractFloat128Sign( a
);
5942 if ( aExp
== 0x7FFF ) {
5943 if ( aSig0
| aSig1
) {
5944 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5946 return packFloat64( aSign
, 0x7FF, 0 );
5948 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5949 aSig0
|= ( aSig1
!= 0 );
5950 if ( aExp
|| aSig0
) {
5951 aSig0
|= LIT64( 0x4000000000000000 );
5954 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5958 /*----------------------------------------------------------------------------
5959 | Returns the result of converting the quadruple-precision floating-point
5960 | value `a' to the extended double-precision floating-point format. The
5961 | conversion is performed according to the IEC/IEEE Standard for Binary
5962 | Floating-Point Arithmetic.
5963 *----------------------------------------------------------------------------*/
5965 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5969 uint64_t aSig0
, aSig1
;
5971 aSig1
= extractFloat128Frac1( a
);
5972 aSig0
= extractFloat128Frac0( a
);
5973 aExp
= extractFloat128Exp( a
);
5974 aSign
= extractFloat128Sign( a
);
5975 if ( aExp
== 0x7FFF ) {
5976 if ( aSig0
| aSig1
) {
5977 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5979 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5982 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5983 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5986 aSig0
|= LIT64( 0x0001000000000000 );
5988 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5989 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5993 /*----------------------------------------------------------------------------
5994 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5995 | returns the result as a quadruple-precision floating-point value. The
5996 | operation is performed according to the IEC/IEEE Standard for Binary
5997 | Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
6000 float128
float128_round_to_int( float128 a STATUS_PARAM
)
6004 uint64_t lastBitMask
, roundBitsMask
;
6007 aExp
= extractFloat128Exp( a
);
6008 if ( 0x402F <= aExp
) {
6009 if ( 0x406F <= aExp
) {
6010 if ( ( aExp
== 0x7FFF )
6011 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
6013 return propagateFloat128NaN( a
, a STATUS_VAR
);
6018 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
6019 roundBitsMask
= lastBitMask
- 1;
6021 switch (STATUS(float_rounding_mode
)) {
6022 case float_round_nearest_even
:
6023 if ( lastBitMask
) {
6024 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
6025 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
6028 if ( (int64_t) z
.low
< 0 ) {
6030 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
6034 case float_round_ties_away
:
6036 add128(z
.high
, z
.low
, 0, lastBitMask
>> 1, &z
.high
, &z
.low
);
6038 if ((int64_t) z
.low
< 0) {
6043 case float_round_to_zero
:
6045 case float_round_up
:
6046 if (!extractFloat128Sign(z
)) {
6047 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6050 case float_round_down
:
6051 if (extractFloat128Sign(z
)) {
6052 add128(z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
6058 z
.low
&= ~ roundBitsMask
;
6061 if ( aExp
< 0x3FFF ) {
6062 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
6063 STATUS(float_exception_flags
) |= float_flag_inexact
;
6064 aSign
= extractFloat128Sign( a
);
6065 switch ( STATUS(float_rounding_mode
) ) {
6066 case float_round_nearest_even
:
6067 if ( ( aExp
== 0x3FFE )
6068 && ( extractFloat128Frac0( a
)
6069 | extractFloat128Frac1( a
) )
6071 return packFloat128( aSign
, 0x3FFF, 0, 0 );
6074 case float_round_ties_away
:
6075 if (aExp
== 0x3FFE) {
6076 return packFloat128(aSign
, 0x3FFF, 0, 0);
6079 case float_round_down
:
6081 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
6082 : packFloat128( 0, 0, 0, 0 );
6083 case float_round_up
:
6085 aSign
? packFloat128( 1, 0, 0, 0 )
6086 : packFloat128( 0, 0x3FFF, 0, 0 );
6088 return packFloat128( aSign
, 0, 0, 0 );
6091 lastBitMask
<<= 0x402F - aExp
;
6092 roundBitsMask
= lastBitMask
- 1;
6095 switch (STATUS(float_rounding_mode
)) {
6096 case float_round_nearest_even
:
6097 z
.high
+= lastBitMask
>>1;
6098 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
6099 z
.high
&= ~ lastBitMask
;
6102 case float_round_ties_away
:
6103 z
.high
+= lastBitMask
>>1;
6105 case float_round_to_zero
:
6107 case float_round_up
:
6108 if (!extractFloat128Sign(z
)) {
6109 z
.high
|= ( a
.low
!= 0 );
6110 z
.high
+= roundBitsMask
;
6113 case float_round_down
:
6114 if (extractFloat128Sign(z
)) {
6115 z
.high
|= (a
.low
!= 0);
6116 z
.high
+= roundBitsMask
;
6122 z
.high
&= ~ roundBitsMask
;
6124 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
6125 STATUS(float_exception_flags
) |= float_flag_inexact
;
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of adding the absolute values of the quadruple-precision
6133 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6134 | before being returned. `zSign' is ignored if the result is a NaN.
6135 | The addition is performed according to the IEC/IEEE Standard for Binary
6136 | Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6139 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
6141 int32 aExp
, bExp
, zExp
;
6142 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6145 aSig1
= extractFloat128Frac1( a
);
6146 aSig0
= extractFloat128Frac0( a
);
6147 aExp
= extractFloat128Exp( a
);
6148 bSig1
= extractFloat128Frac1( b
);
6149 bSig0
= extractFloat128Frac0( b
);
6150 bExp
= extractFloat128Exp( b
);
6151 expDiff
= aExp
- bExp
;
6152 if ( 0 < expDiff
) {
6153 if ( aExp
== 0x7FFF ) {
6154 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6161 bSig0
|= LIT64( 0x0001000000000000 );
6163 shift128ExtraRightJamming(
6164 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
6167 else if ( expDiff
< 0 ) {
6168 if ( bExp
== 0x7FFF ) {
6169 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6170 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6176 aSig0
|= LIT64( 0x0001000000000000 );
6178 shift128ExtraRightJamming(
6179 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
6183 if ( aExp
== 0x7FFF ) {
6184 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6185 return propagateFloat128NaN( a
, b STATUS_VAR
);
6189 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6191 if (STATUS(flush_to_zero
)) {
6192 if (zSig0
| zSig1
) {
6193 float_raise(float_flag_output_denormal STATUS_VAR
);
6195 return packFloat128(zSign
, 0, 0, 0);
6197 return packFloat128( zSign
, 0, zSig0
, zSig1
);
6200 zSig0
|= LIT64( 0x0002000000000000 );
6204 aSig0
|= LIT64( 0x0001000000000000 );
6205 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6207 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
6210 shift128ExtraRightJamming(
6211 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6213 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6217 /*----------------------------------------------------------------------------
6218 | Returns the result of subtracting the absolute values of the quadruple-
6219 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6220 | difference is negated before being returned. `zSign' is ignored if the
6221 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6222 | Standard for Binary Floating-Point Arithmetic.
6223 *----------------------------------------------------------------------------*/
6225 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
6227 int32 aExp
, bExp
, zExp
;
6228 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
6232 aSig1
= extractFloat128Frac1( a
);
6233 aSig0
= extractFloat128Frac0( a
);
6234 aExp
= extractFloat128Exp( a
);
6235 bSig1
= extractFloat128Frac1( b
);
6236 bSig0
= extractFloat128Frac0( b
);
6237 bExp
= extractFloat128Exp( b
);
6238 expDiff
= aExp
- bExp
;
6239 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
6240 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
6241 if ( 0 < expDiff
) goto aExpBigger
;
6242 if ( expDiff
< 0 ) goto bExpBigger
;
6243 if ( aExp
== 0x7FFF ) {
6244 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
6245 return propagateFloat128NaN( a
, b STATUS_VAR
);
6247 float_raise( float_flag_invalid STATUS_VAR
);
6248 z
.low
= float128_default_nan_low
;
6249 z
.high
= float128_default_nan_high
;
6256 if ( bSig0
< aSig0
) goto aBigger
;
6257 if ( aSig0
< bSig0
) goto bBigger
;
6258 if ( bSig1
< aSig1
) goto aBigger
;
6259 if ( aSig1
< bSig1
) goto bBigger
;
6260 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
6262 if ( bExp
== 0x7FFF ) {
6263 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6264 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6270 aSig0
|= LIT64( 0x4000000000000000 );
6272 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6273 bSig0
|= LIT64( 0x4000000000000000 );
6275 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6278 goto normalizeRoundAndPack
;
6280 if ( aExp
== 0x7FFF ) {
6281 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6288 bSig0
|= LIT64( 0x4000000000000000 );
6290 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6291 aSig0
|= LIT64( 0x4000000000000000 );
6293 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6295 normalizeRoundAndPack
:
6297 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6307 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
6311 aSign
= extractFloat128Sign( a
);
6312 bSign
= extractFloat128Sign( b
);
6313 if ( aSign
== bSign
) {
6314 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6317 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6328 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
6332 aSign
= extractFloat128Sign( a
);
6333 bSign
= extractFloat128Sign( b
);
6334 if ( aSign
== bSign
) {
6335 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6338 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6349 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
6351 flag aSign
, bSign
, zSign
;
6352 int32 aExp
, bExp
, zExp
;
6353 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6356 aSig1
= extractFloat128Frac1( a
);
6357 aSig0
= extractFloat128Frac0( a
);
6358 aExp
= extractFloat128Exp( a
);
6359 aSign
= extractFloat128Sign( a
);
6360 bSig1
= extractFloat128Frac1( b
);
6361 bSig0
= extractFloat128Frac0( b
);
6362 bExp
= extractFloat128Exp( b
);
6363 bSign
= extractFloat128Sign( b
);
6364 zSign
= aSign
^ bSign
;
6365 if ( aExp
== 0x7FFF ) {
6366 if ( ( aSig0
| aSig1
)
6367 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6368 return propagateFloat128NaN( a
, b STATUS_VAR
);
6370 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6371 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6373 if ( bExp
== 0x7FFF ) {
6374 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6375 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6377 float_raise( float_flag_invalid STATUS_VAR
);
6378 z
.low
= float128_default_nan_low
;
6379 z
.high
= float128_default_nan_high
;
6382 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6385 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6386 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6389 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6390 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6392 zExp
= aExp
+ bExp
- 0x4000;
6393 aSig0
|= LIT64( 0x0001000000000000 );
6394 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6395 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6396 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6397 zSig2
|= ( zSig3
!= 0 );
6398 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6399 shift128ExtraRightJamming(
6400 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6403 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6407 /*----------------------------------------------------------------------------
6408 | Returns the result of dividing the quadruple-precision floating-point value
6409 | `a' by the corresponding value `b'. The operation is performed according to
6410 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6413 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
6415 flag aSign
, bSign
, zSign
;
6416 int32 aExp
, bExp
, zExp
;
6417 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6418 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6421 aSig1
= extractFloat128Frac1( a
);
6422 aSig0
= extractFloat128Frac0( a
);
6423 aExp
= extractFloat128Exp( a
);
6424 aSign
= extractFloat128Sign( a
);
6425 bSig1
= extractFloat128Frac1( b
);
6426 bSig0
= extractFloat128Frac0( b
);
6427 bExp
= extractFloat128Exp( b
);
6428 bSign
= extractFloat128Sign( b
);
6429 zSign
= aSign
^ bSign
;
6430 if ( aExp
== 0x7FFF ) {
6431 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6432 if ( bExp
== 0x7FFF ) {
6433 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6436 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6438 if ( bExp
== 0x7FFF ) {
6439 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6440 return packFloat128( zSign
, 0, 0, 0 );
6443 if ( ( bSig0
| bSig1
) == 0 ) {
6444 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6446 float_raise( float_flag_invalid STATUS_VAR
);
6447 z
.low
= float128_default_nan_low
;
6448 z
.high
= float128_default_nan_high
;
6451 float_raise( float_flag_divbyzero STATUS_VAR
);
6452 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6454 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6457 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6458 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6460 zExp
= aExp
- bExp
+ 0x3FFD;
6462 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6464 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6465 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6466 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6469 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6470 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6471 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6472 while ( (int64_t) rem0
< 0 ) {
6474 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6476 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6477 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6478 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6479 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6480 while ( (int64_t) rem1
< 0 ) {
6482 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6484 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6486 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6487 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6491 /*----------------------------------------------------------------------------
6492 | Returns the remainder of the quadruple-precision floating-point value `a'
6493 | with respect to the corresponding value `b'. The operation is performed
6494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6495 *----------------------------------------------------------------------------*/
6497 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
6500 int32 aExp
, bExp
, expDiff
;
6501 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6502 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6506 aSig1
= extractFloat128Frac1( a
);
6507 aSig0
= extractFloat128Frac0( a
);
6508 aExp
= extractFloat128Exp( a
);
6509 aSign
= extractFloat128Sign( a
);
6510 bSig1
= extractFloat128Frac1( b
);
6511 bSig0
= extractFloat128Frac0( b
);
6512 bExp
= extractFloat128Exp( b
);
6513 if ( aExp
== 0x7FFF ) {
6514 if ( ( aSig0
| aSig1
)
6515 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6516 return propagateFloat128NaN( a
, b STATUS_VAR
);
6520 if ( bExp
== 0x7FFF ) {
6521 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6525 if ( ( bSig0
| bSig1
) == 0 ) {
6527 float_raise( float_flag_invalid STATUS_VAR
);
6528 z
.low
= float128_default_nan_low
;
6529 z
.high
= float128_default_nan_high
;
6532 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6535 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6536 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6538 expDiff
= aExp
- bExp
;
6539 if ( expDiff
< -1 ) return a
;
6541 aSig0
| LIT64( 0x0001000000000000 ),
6543 15 - ( expDiff
< 0 ),
6548 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6549 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6550 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6552 while ( 0 < expDiff
) {
6553 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6554 q
= ( 4 < q
) ? q
- 4 : 0;
6555 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6556 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6557 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6558 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6561 if ( -64 < expDiff
) {
6562 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6563 q
= ( 4 < q
) ? q
- 4 : 0;
6565 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6567 if ( expDiff
< 0 ) {
6568 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6571 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6573 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6574 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6577 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6578 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6581 alternateASig0
= aSig0
;
6582 alternateASig1
= aSig1
;
6584 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6585 } while ( 0 <= (int64_t) aSig0
);
6587 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6588 if ( ( sigMean0
< 0 )
6589 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6590 aSig0
= alternateASig0
;
6591 aSig1
= alternateASig1
;
6593 zSign
= ( (int64_t) aSig0
< 0 );
6594 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6596 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6606 float128
float128_sqrt( float128 a STATUS_PARAM
)
6610 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6611 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6614 aSig1
= extractFloat128Frac1( a
);
6615 aSig0
= extractFloat128Frac0( a
);
6616 aExp
= extractFloat128Exp( a
);
6617 aSign
= extractFloat128Sign( a
);
6618 if ( aExp
== 0x7FFF ) {
6619 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
6620 if ( ! aSign
) return a
;
6624 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6626 float_raise( float_flag_invalid STATUS_VAR
);
6627 z
.low
= float128_default_nan_low
;
6628 z
.high
= float128_default_nan_high
;
6632 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6633 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6635 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6636 aSig0
|= LIT64( 0x0001000000000000 );
6637 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6638 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6639 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6640 doubleZSig0
= zSig0
<<1;
6641 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6642 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6643 while ( (int64_t) rem0
< 0 ) {
6646 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6648 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6649 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6650 if ( zSig1
== 0 ) zSig1
= 1;
6651 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6652 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6653 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6654 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6655 while ( (int64_t) rem1
< 0 ) {
6657 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6659 term2
|= doubleZSig0
;
6660 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6662 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6664 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6665 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6669 /*----------------------------------------------------------------------------
6670 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6671 | the corresponding value `b', and 0 otherwise. The invalid exception is
6672 | raised if either operand is a NaN. Otherwise, the comparison is performed
6673 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6674 *----------------------------------------------------------------------------*/
6676 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
6679 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6680 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6681 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6682 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6684 float_raise( float_flag_invalid STATUS_VAR
);
6689 && ( ( a
.high
== b
.high
)
6691 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6696 /*----------------------------------------------------------------------------
6697 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6698 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6699 | exception is raised if either operand is a NaN. The comparison is performed
6700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6701 *----------------------------------------------------------------------------*/
6703 int float128_le( float128 a
, float128 b STATUS_PARAM
)
6707 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6708 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6709 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6710 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6712 float_raise( float_flag_invalid STATUS_VAR
);
6715 aSign
= extractFloat128Sign( a
);
6716 bSign
= extractFloat128Sign( b
);
6717 if ( aSign
!= bSign
) {
6720 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6724 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6725 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6729 /*----------------------------------------------------------------------------
6730 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6731 | the corresponding value `b', and 0 otherwise. The invalid exception is
6732 | raised if either operand is a NaN. The comparison is performed according
6733 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6734 *----------------------------------------------------------------------------*/
6736 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
6740 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6741 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6742 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6743 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6745 float_raise( float_flag_invalid STATUS_VAR
);
6748 aSign
= extractFloat128Sign( a
);
6749 bSign
= extractFloat128Sign( b
);
6750 if ( aSign
!= bSign
) {
6753 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6757 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6758 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6762 /*----------------------------------------------------------------------------
6763 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6764 | be compared, and 0 otherwise. The invalid exception is raised if either
6765 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6766 | Standard for Binary Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6769 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
6771 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6772 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6773 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6774 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6776 float_raise( float_flag_invalid STATUS_VAR
);
6782 /*----------------------------------------------------------------------------
6783 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6784 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6785 | exception. The comparison is performed according to the IEC/IEEE Standard
6786 | for Binary Floating-Point Arithmetic.
6787 *----------------------------------------------------------------------------*/
6789 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
6792 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6793 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6794 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6795 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6797 if ( float128_is_signaling_nan( a
)
6798 || float128_is_signaling_nan( b
) ) {
6799 float_raise( float_flag_invalid STATUS_VAR
);
6805 && ( ( a
.high
== b
.high
)
6807 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6812 /*----------------------------------------------------------------------------
6813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6814 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6815 | cause an exception. Otherwise, the comparison is performed according to the
6816 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6817 *----------------------------------------------------------------------------*/
6819 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
6823 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6824 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6825 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6826 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6828 if ( float128_is_signaling_nan( a
)
6829 || float128_is_signaling_nan( b
) ) {
6830 float_raise( float_flag_invalid STATUS_VAR
);
6834 aSign
= extractFloat128Sign( a
);
6835 bSign
= extractFloat128Sign( b
);
6836 if ( aSign
!= bSign
) {
6839 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6843 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6844 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6848 /*----------------------------------------------------------------------------
6849 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6850 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6851 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6852 | Standard for Binary Floating-Point Arithmetic.
6853 *----------------------------------------------------------------------------*/
6855 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
6859 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6860 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6861 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6862 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6864 if ( float128_is_signaling_nan( a
)
6865 || float128_is_signaling_nan( b
) ) {
6866 float_raise( float_flag_invalid STATUS_VAR
);
6870 aSign
= extractFloat128Sign( a
);
6871 bSign
= extractFloat128Sign( b
);
6872 if ( aSign
!= bSign
) {
6875 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6879 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6880 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6884 /*----------------------------------------------------------------------------
6885 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6886 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6887 | comparison is performed according to the IEC/IEEE Standard for Binary
6888 | Floating-Point Arithmetic.
6889 *----------------------------------------------------------------------------*/
6891 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
6893 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6894 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6895 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6896 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6898 if ( float128_is_signaling_nan( a
)
6899 || float128_is_signaling_nan( b
) ) {
6900 float_raise( float_flag_invalid STATUS_VAR
);
6907 /* misc functions */
6908 float32
uint32_to_float32(uint32_t a STATUS_PARAM
)
6910 return int64_to_float32(a STATUS_VAR
);
6913 float64
uint32_to_float64(uint32_t a STATUS_PARAM
)
6915 return int64_to_float64(a STATUS_VAR
);
6918 uint32
float32_to_uint32( float32 a STATUS_PARAM
)
6922 int old_exc_flags
= get_float_exception_flags(status
);
6924 v
= float32_to_int64(a STATUS_VAR
);
6927 } else if (v
> 0xffffffff) {
6932 set_float_exception_flags(old_exc_flags
, status
);
6933 float_raise(float_flag_invalid STATUS_VAR
);
6937 uint32
float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6941 int old_exc_flags
= get_float_exception_flags(status
);
6943 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6946 } else if (v
> 0xffffffff) {
6951 set_float_exception_flags(old_exc_flags
, status
);
6952 float_raise(float_flag_invalid STATUS_VAR
);
6956 int_fast16_t float32_to_int16(float32 a STATUS_PARAM
)
6960 int old_exc_flags
= get_float_exception_flags(status
);
6962 v
= float32_to_int32(a STATUS_VAR
);
6965 } else if (v
> 0x7fff) {
6971 set_float_exception_flags(old_exc_flags
, status
);
6972 float_raise(float_flag_invalid STATUS_VAR
);
6976 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM
)
6980 int old_exc_flags
= get_float_exception_flags(status
);
6982 v
= float32_to_int32(a STATUS_VAR
);
6985 } else if (v
> 0xffff) {
6991 set_float_exception_flags(old_exc_flags
, status
);
6992 float_raise(float_flag_invalid STATUS_VAR
);
6996 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM
)
7000 int old_exc_flags
= get_float_exception_flags(status
);
7002 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
7005 } else if (v
> 0xffff) {
7010 set_float_exception_flags(old_exc_flags
, status
);
7011 float_raise(float_flag_invalid STATUS_VAR
);
7015 uint32
float64_to_uint32( float64 a STATUS_PARAM
)
7019 int old_exc_flags
= get_float_exception_flags(status
);
7021 v
= float64_to_uint64(a STATUS_VAR
);
7022 if (v
> 0xffffffff) {
7027 set_float_exception_flags(old_exc_flags
, status
);
7028 float_raise(float_flag_invalid STATUS_VAR
);
7032 uint32
float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
7036 int old_exc_flags
= get_float_exception_flags(status
);
7038 v
= float64_to_uint64_round_to_zero(a STATUS_VAR
);
7039 if (v
> 0xffffffff) {
7044 set_float_exception_flags(old_exc_flags
, status
);
7045 float_raise(float_flag_invalid STATUS_VAR
);
7049 int_fast16_t float64_to_int16(float64 a STATUS_PARAM
)
7053 int old_exc_flags
= get_float_exception_flags(status
);
7055 v
= float64_to_int32(a STATUS_VAR
);
7058 } else if (v
> 0x7fff) {
7064 set_float_exception_flags(old_exc_flags
, status
);
7065 float_raise(float_flag_invalid STATUS_VAR
);
7069 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM
)
7073 int old_exc_flags
= get_float_exception_flags(status
);
7075 v
= float64_to_int32(a STATUS_VAR
);
7078 } else if (v
> 0xffff) {
7084 set_float_exception_flags(old_exc_flags
, status
);
7085 float_raise(float_flag_invalid STATUS_VAR
);
7089 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM
)
7093 int old_exc_flags
= get_float_exception_flags(status
);
7095 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
7098 } else if (v
> 0xffff) {
7103 set_float_exception_flags(old_exc_flags
, status
);
7104 float_raise(float_flag_invalid STATUS_VAR
);
7108 /*----------------------------------------------------------------------------
7109 | Returns the result of converting the double-precision floating-point value
7110 | `a' to the 64-bit unsigned integer format. The conversion is
7111 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7112 | Arithmetic---which means in particular that the conversion is rounded
7113 | according to the current rounding mode. If `a' is a NaN, the largest
7114 | positive integer is returned. If the conversion overflows, the
7115 | largest unsigned integer is returned. If 'a' is negative, the value is
7116 | rounded and zero is returned; negative values that do not round to zero
7117 | will raise the inexact exception.
7118 *----------------------------------------------------------------------------*/
7120 uint64_t float64_to_uint64(float64 a STATUS_PARAM
)
7123 int_fast16_t aExp
, shiftCount
;
7124 uint64_t aSig
, aSigExtra
;
7125 a
= float64_squash_input_denormal(a STATUS_VAR
);
7127 aSig
= extractFloat64Frac(a
);
7128 aExp
= extractFloat64Exp(a
);
7129 aSign
= extractFloat64Sign(a
);
7130 if (aSign
&& (aExp
> 1022)) {
7131 float_raise(float_flag_invalid STATUS_VAR
);
7132 if (float64_is_any_nan(a
)) {
7133 return LIT64(0xFFFFFFFFFFFFFFFF);
7139 aSig
|= LIT64(0x0010000000000000);
7141 shiftCount
= 0x433 - aExp
;
7142 if (shiftCount
<= 0) {
7144 float_raise(float_flag_invalid STATUS_VAR
);
7145 return LIT64(0xFFFFFFFFFFFFFFFF);
7148 aSig
<<= -shiftCount
;
7150 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
7152 return roundAndPackUint64(aSign
, aSig
, aSigExtra STATUS_VAR
);
7155 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
7157 signed char current_rounding_mode
= STATUS(float_rounding_mode
);
7158 set_float_rounding_mode(float_round_to_zero STATUS_VAR
);
7159 int64_t v
= float64_to_uint64(a STATUS_VAR
);
7160 set_float_rounding_mode(current_rounding_mode STATUS_VAR
);
7164 #define COMPARE(s, nan_exp) \
7165 static inline int float ## s ## _compare_internal( float ## s a, float ## s b, \
7166 int is_quiet STATUS_PARAM ) \
7168 flag aSign, bSign; \
7169 uint ## s ## _t av, bv; \
7170 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7171 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7173 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7174 extractFloat ## s ## Frac( a ) ) || \
7175 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7176 extractFloat ## s ## Frac( b ) )) { \
7178 float ## s ## _is_signaling_nan( a ) || \
7179 float ## s ## _is_signaling_nan( b ) ) { \
7180 float_raise( float_flag_invalid STATUS_VAR); \
7182 return float_relation_unordered; \
7184 aSign = extractFloat ## s ## Sign( a ); \
7185 bSign = extractFloat ## s ## Sign( b ); \
7186 av = float ## s ## _val(a); \
7187 bv = float ## s ## _val(b); \
7188 if ( aSign != bSign ) { \
7189 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7191 return float_relation_equal; \
7193 return 1 - (2 * aSign); \
7197 return float_relation_equal; \
7199 return 1 - 2 * (aSign ^ ( av < bv )); \
7204 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
7206 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7209 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
7211 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7217 static inline int floatx80_compare_internal( floatx80 a
, floatx80 b
,
7218 int is_quiet STATUS_PARAM
)
7222 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
7223 ( extractFloatx80Frac( a
)<<1 ) ) ||
7224 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
7225 ( extractFloatx80Frac( b
)<<1 ) )) {
7227 floatx80_is_signaling_nan( a
) ||
7228 floatx80_is_signaling_nan( b
) ) {
7229 float_raise( float_flag_invalid STATUS_VAR
);
7231 return float_relation_unordered
;
7233 aSign
= extractFloatx80Sign( a
);
7234 bSign
= extractFloatx80Sign( b
);
7235 if ( aSign
!= bSign
) {
7237 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
7238 ( ( a
.low
| b
.low
) == 0 ) ) {
7240 return float_relation_equal
;
7242 return 1 - (2 * aSign
);
7245 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7246 return float_relation_equal
;
7248 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7253 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
7255 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
7258 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
7260 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
7263 static inline int float128_compare_internal( float128 a
, float128 b
,
7264 int is_quiet STATUS_PARAM
)
7268 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7269 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7270 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7271 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7273 float128_is_signaling_nan( a
) ||
7274 float128_is_signaling_nan( b
) ) {
7275 float_raise( float_flag_invalid STATUS_VAR
);
7277 return float_relation_unordered
;
7279 aSign
= extractFloat128Sign( a
);
7280 bSign
= extractFloat128Sign( b
);
7281 if ( aSign
!= bSign
) {
7282 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7284 return float_relation_equal
;
7286 return 1 - (2 * aSign
);
7289 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7290 return float_relation_equal
;
7292 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7297 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
7299 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
7302 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
7304 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
7307 /* min() and max() functions. These can't be implemented as
7308 * 'compare and pick one input' because that would mishandle
7309 * NaNs and +0 vs -0.
7311 * minnum() and maxnum() functions. These are similar to the min()
7312 * and max() functions but if one of the arguments is a QNaN and
7313 * the other is numerical then the numerical argument is returned.
7314 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7315 * and maxNum() operations. min() and max() are the typical min/max
7316 * semantics provided by many CPUs which predate that specification.
7318 * minnummag() and maxnummag() functions correspond to minNumMag()
7319 * and minNumMag() from the IEEE-754 2008.
7322 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7323 int ismin, int isieee, \
7324 int ismag STATUS_PARAM) \
7326 flag aSign, bSign; \
7327 uint ## s ## _t av, bv, aav, abv; \
7328 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7329 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7330 if (float ## s ## _is_any_nan(a) || \
7331 float ## s ## _is_any_nan(b)) { \
7333 if (float ## s ## _is_quiet_nan(a) && \
7334 !float ## s ##_is_any_nan(b)) { \
7336 } else if (float ## s ## _is_quiet_nan(b) && \
7337 !float ## s ## _is_any_nan(a)) { \
7341 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7343 aSign = extractFloat ## s ## Sign(a); \
7344 bSign = extractFloat ## s ## Sign(b); \
7345 av = float ## s ## _val(a); \
7346 bv = float ## s ## _val(b); \
7348 aav = float ## s ## _abs(av); \
7349 abv = float ## s ## _abs(bv); \
7352 return (aav < abv) ? a : b; \
7354 return (aav < abv) ? b : a; \
7358 if (aSign != bSign) { \
7360 return aSign ? a : b; \
7362 return aSign ? b : a; \
7366 return (aSign ^ (av < bv)) ? a : b; \
7368 return (aSign ^ (av < bv)) ? b : a; \
7373 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7375 return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR); \
7378 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7380 return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR); \
7383 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7385 return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR); \
7388 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7390 return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR); \
7393 float ## s float ## s ## _minnummag(float ## s a, float ## s b STATUS_PARAM) \
7395 return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR); \
7398 float ## s float ## s ## _maxnummag(float ## s a, float ## s b STATUS_PARAM) \
7400 return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR); \
7407 /* Multiply A by 2 raised to the power N. */
7408 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
7414 a
= float32_squash_input_denormal(a STATUS_VAR
);
7415 aSig
= extractFloat32Frac( a
);
7416 aExp
= extractFloat32Exp( a
);
7417 aSign
= extractFloat32Sign( a
);
7419 if ( aExp
== 0xFF ) {
7421 return propagateFloat32NaN( a
, a STATUS_VAR
);
7427 } else if (aSig
== 0) {
7435 } else if (n
< -0x200) {
7441 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
7444 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
7450 a
= float64_squash_input_denormal(a STATUS_VAR
);
7451 aSig
= extractFloat64Frac( a
);
7452 aExp
= extractFloat64Exp( a
);
7453 aSign
= extractFloat64Sign( a
);
7455 if ( aExp
== 0x7FF ) {
7457 return propagateFloat64NaN( a
, a STATUS_VAR
);
7462 aSig
|= LIT64( 0x0010000000000000 );
7463 } else if (aSig
== 0) {
7471 } else if (n
< -0x1000) {
7477 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
7480 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
7486 aSig
= extractFloatx80Frac( a
);
7487 aExp
= extractFloatx80Exp( a
);
7488 aSign
= extractFloatx80Sign( a
);
7490 if ( aExp
== 0x7FFF ) {
7492 return propagateFloatx80NaN( a
, a STATUS_VAR
);
7506 } else if (n
< -0x10000) {
7511 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
7512 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
7515 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
7519 uint64_t aSig0
, aSig1
;
7521 aSig1
= extractFloat128Frac1( a
);
7522 aSig0
= extractFloat128Frac0( a
);
7523 aExp
= extractFloat128Exp( a
);
7524 aSign
= extractFloat128Sign( a
);
7525 if ( aExp
== 0x7FFF ) {
7526 if ( aSig0
| aSig1
) {
7527 return propagateFloat128NaN( a
, a STATUS_VAR
);
7532 aSig0
|= LIT64( 0x0001000000000000 );
7533 } else if (aSig0
== 0 && aSig1
== 0) {
7541 } else if (n
< -0x10000) {
7546 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1