migration: rename cancel to cleanup in SaveVMHandles
[qemu/ar7.git] / fpu / softfloat.c
blobf1170fe500168a113231d0859bbefdf2b08a0b77
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "config.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
90 #include <stdlib.h>
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "softfloat-macros.h"
99 /*----------------------------------------------------------------------------
100 | Functions and definitions to determine: (1) whether tininess for underflow
101 | is detected before or after rounding by default, (2) what (if anything)
102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 | are propagated from function inputs to output. These details are target-
105 | specific.
106 *----------------------------------------------------------------------------*/
107 #include "softfloat-specialize.h"
109 /*----------------------------------------------------------------------------
110 | Returns the fraction bits of the half-precision floating-point value `a'.
111 *----------------------------------------------------------------------------*/
113 static inline uint32_t extractFloat16Frac(float16 a)
115 return float16_val(a) & 0x3ff;
118 /*----------------------------------------------------------------------------
119 | Returns the exponent bits of the half-precision floating-point value `a'.
120 *----------------------------------------------------------------------------*/
122 static inline int_fast16_t extractFloat16Exp(float16 a)
124 return (float16_val(a) >> 10) & 0x1f;
127 /*----------------------------------------------------------------------------
128 | Returns the sign bit of the single-precision floating-point value `a'.
129 *----------------------------------------------------------------------------*/
131 static inline flag extractFloat16Sign(float16 a)
133 return float16_val(a)>>15;
136 /*----------------------------------------------------------------------------
137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
139 | input. If `zSign' is 1, the input is negated before being converted to an
140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141 | is simply rounded to an integer, with the inexact exception raised if the
142 | input cannot be represented exactly as an integer. However, if the fixed-
143 | point input is too large, the invalid exception is raised and the largest
144 | positive or negative integer is returned.
145 *----------------------------------------------------------------------------*/
147 static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
149 int8 roundingMode;
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
152 int32_t z;
154 roundingMode = status->float_rounding_mode;
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
156 switch (roundingMode) {
157 case float_round_nearest_even:
158 case float_round_ties_away:
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
179 float_raise(float_flag_invalid, status);
180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 if (roundBits) {
183 status->float_exception_flags |= float_flag_inexact;
185 return z;
189 /*----------------------------------------------------------------------------
190 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
191 | `absZ1', with binary point between bits 63 and 64 (between the input words),
192 | and returns the properly rounded 64-bit integer corresponding to the input.
193 | If `zSign' is 1, the input is negated before being converted to an integer.
194 | Ordinarily, the fixed-point input is simply rounded to an integer, with
195 | the inexact exception raised if the input cannot be represented exactly as
196 | an integer. However, if the fixed-point input is too large, the invalid
197 | exception is raised and the largest positive or negative integer is
198 | returned.
199 *----------------------------------------------------------------------------*/
201 static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
202 float_status *status)
204 int8 roundingMode;
205 flag roundNearestEven, increment;
206 int64_t z;
208 roundingMode = status->float_rounding_mode;
209 roundNearestEven = ( roundingMode == float_round_nearest_even );
210 switch (roundingMode) {
211 case float_round_nearest_even:
212 case float_round_ties_away:
213 increment = ((int64_t) absZ1 < 0);
214 break;
215 case float_round_to_zero:
216 increment = 0;
217 break;
218 case float_round_up:
219 increment = !zSign && absZ1;
220 break;
221 case float_round_down:
222 increment = zSign && absZ1;
223 break;
224 default:
225 abort();
227 if ( increment ) {
228 ++absZ0;
229 if ( absZ0 == 0 ) goto overflow;
230 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
232 z = absZ0;
233 if ( zSign ) z = - z;
234 if ( z && ( ( z < 0 ) ^ zSign ) ) {
235 overflow:
236 float_raise(float_flag_invalid, status);
237 return
238 zSign ? (int64_t) LIT64( 0x8000000000000000 )
239 : LIT64( 0x7FFFFFFFFFFFFFFF );
241 if (absZ1) {
242 status->float_exception_flags |= float_flag_inexact;
244 return z;
248 /*----------------------------------------------------------------------------
249 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
250 | `absZ1', with binary point between bits 63 and 64 (between the input words),
251 | and returns the properly rounded 64-bit unsigned integer corresponding to the
252 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
253 | with the inexact exception raised if the input cannot be represented exactly
254 | as an integer. However, if the fixed-point input is too large, the invalid
255 | exception is raised and the largest unsigned integer is returned.
256 *----------------------------------------------------------------------------*/
258 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
259 uint64_t absZ1, float_status *status)
261 int8 roundingMode;
262 flag roundNearestEven, increment;
264 roundingMode = status->float_rounding_mode;
265 roundNearestEven = (roundingMode == float_round_nearest_even);
266 switch (roundingMode) {
267 case float_round_nearest_even:
268 case float_round_ties_away:
269 increment = ((int64_t)absZ1 < 0);
270 break;
271 case float_round_to_zero:
272 increment = 0;
273 break;
274 case float_round_up:
275 increment = !zSign && absZ1;
276 break;
277 case float_round_down:
278 increment = zSign && absZ1;
279 break;
280 default:
281 abort();
283 if (increment) {
284 ++absZ0;
285 if (absZ0 == 0) {
286 float_raise(float_flag_invalid, status);
287 return LIT64(0xFFFFFFFFFFFFFFFF);
289 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
292 if (zSign && absZ0) {
293 float_raise(float_flag_invalid, status);
294 return 0;
297 if (absZ1) {
298 status->float_exception_flags |= float_flag_inexact;
300 return absZ0;
303 /*----------------------------------------------------------------------------
304 | Returns the fraction bits of the single-precision floating-point value `a'.
305 *----------------------------------------------------------------------------*/
307 static inline uint32_t extractFloat32Frac( float32 a )
310 return float32_val(a) & 0x007FFFFF;
314 /*----------------------------------------------------------------------------
315 | Returns the exponent bits of the single-precision floating-point value `a'.
316 *----------------------------------------------------------------------------*/
318 static inline int_fast16_t extractFloat32Exp(float32 a)
321 return ( float32_val(a)>>23 ) & 0xFF;
325 /*----------------------------------------------------------------------------
326 | Returns the sign bit of the single-precision floating-point value `a'.
327 *----------------------------------------------------------------------------*/
329 static inline flag extractFloat32Sign( float32 a )
332 return float32_val(a)>>31;
336 /*----------------------------------------------------------------------------
337 | If `a' is denormal and we are in flush-to-zero mode then set the
338 | input-denormal exception and return zero. Otherwise just return the value.
339 *----------------------------------------------------------------------------*/
340 float32 float32_squash_input_denormal(float32 a, float_status *status)
342 if (status->flush_inputs_to_zero) {
343 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
344 float_raise(float_flag_input_denormal, status);
345 return make_float32(float32_val(a) & 0x80000000);
348 return a;
351 /*----------------------------------------------------------------------------
352 | Normalizes the subnormal single-precision floating-point value represented
353 | by the denormalized significand `aSig'. The normalized exponent and
354 | significand are stored at the locations pointed to by `zExpPtr' and
355 | `zSigPtr', respectively.
356 *----------------------------------------------------------------------------*/
358 static void
359 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
361 int8 shiftCount;
363 shiftCount = countLeadingZeros32( aSig ) - 8;
364 *zSigPtr = aSig<<shiftCount;
365 *zExpPtr = 1 - shiftCount;
369 /*----------------------------------------------------------------------------
370 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
371 | single-precision floating-point value, returning the result. After being
372 | shifted into the proper positions, the three fields are simply added
373 | together to form the result. This means that any integer portion of `zSig'
374 | will be added into the exponent. Since a properly normalized significand
375 | will have an integer portion equal to 1, the `zExp' input should be 1 less
376 | than the desired result exponent whenever `zSig' is a complete, normalized
377 | significand.
378 *----------------------------------------------------------------------------*/
380 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
383 return make_float32(
384 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
388 /*----------------------------------------------------------------------------
389 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
390 | and significand `zSig', and returns the proper single-precision floating-
391 | point value corresponding to the abstract input. Ordinarily, the abstract
392 | value is simply rounded and packed into the single-precision format, with
393 | the inexact exception raised if the abstract input cannot be represented
394 | exactly. However, if the abstract value is too large, the overflow and
395 | inexact exceptions are raised and an infinity or maximal finite value is
396 | returned. If the abstract value is too small, the input value is rounded to
397 | a subnormal number, and the underflow and inexact exceptions are raised if
398 | the abstract input cannot be represented exactly as a subnormal single-
399 | precision floating-point number.
400 | The input significand `zSig' has its binary point between bits 30
401 | and 29, which is 7 bits to the left of the usual location. This shifted
402 | significand must be normalized or smaller. If `zSig' is not normalized,
403 | `zExp' must be 0; in that case, the result returned is a subnormal number,
404 | and it must not require rounding. In the usual case that `zSig' is
405 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
406 | The handling of underflow and overflow follows the IEC/IEEE Standard for
407 | Binary Floating-Point Arithmetic.
408 *----------------------------------------------------------------------------*/
410 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
411 float_status *status)
413 int8 roundingMode;
414 flag roundNearestEven;
415 int8 roundIncrement, roundBits;
416 flag isTiny;
418 roundingMode = status->float_rounding_mode;
419 roundNearestEven = ( roundingMode == float_round_nearest_even );
420 switch (roundingMode) {
421 case float_round_nearest_even:
422 case float_round_ties_away:
423 roundIncrement = 0x40;
424 break;
425 case float_round_to_zero:
426 roundIncrement = 0;
427 break;
428 case float_round_up:
429 roundIncrement = zSign ? 0 : 0x7f;
430 break;
431 case float_round_down:
432 roundIncrement = zSign ? 0x7f : 0;
433 break;
434 default:
435 abort();
436 break;
438 roundBits = zSig & 0x7F;
439 if ( 0xFD <= (uint16_t) zExp ) {
440 if ( ( 0xFD < zExp )
441 || ( ( zExp == 0xFD )
442 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
444 float_raise(float_flag_overflow | float_flag_inexact, status);
445 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
447 if ( zExp < 0 ) {
448 if (status->flush_to_zero) {
449 float_raise(float_flag_output_denormal, status);
450 return packFloat32(zSign, 0, 0);
452 isTiny =
453 (status->float_detect_tininess
454 == float_tininess_before_rounding)
455 || ( zExp < -1 )
456 || ( zSig + roundIncrement < 0x80000000 );
457 shift32RightJamming( zSig, - zExp, &zSig );
458 zExp = 0;
459 roundBits = zSig & 0x7F;
460 if (isTiny && roundBits) {
461 float_raise(float_flag_underflow, status);
465 if (roundBits) {
466 status->float_exception_flags |= float_flag_inexact;
468 zSig = ( zSig + roundIncrement )>>7;
469 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
470 if ( zSig == 0 ) zExp = 0;
471 return packFloat32( zSign, zExp, zSig );
475 /*----------------------------------------------------------------------------
476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
477 | and significand `zSig', and returns the proper single-precision floating-
478 | point value corresponding to the abstract input. This routine is just like
479 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
480 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
481 | floating-point exponent.
482 *----------------------------------------------------------------------------*/
484 static float32
485 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
486 float_status *status)
488 int8 shiftCount;
490 shiftCount = countLeadingZeros32( zSig ) - 1;
491 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
492 status);
496 /*----------------------------------------------------------------------------
497 | Returns the fraction bits of the double-precision floating-point value `a'.
498 *----------------------------------------------------------------------------*/
500 static inline uint64_t extractFloat64Frac( float64 a )
503 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
507 /*----------------------------------------------------------------------------
508 | Returns the exponent bits of the double-precision floating-point value `a'.
509 *----------------------------------------------------------------------------*/
511 static inline int_fast16_t extractFloat64Exp(float64 a)
514 return ( float64_val(a)>>52 ) & 0x7FF;
518 /*----------------------------------------------------------------------------
519 | Returns the sign bit of the double-precision floating-point value `a'.
520 *----------------------------------------------------------------------------*/
522 static inline flag extractFloat64Sign( float64 a )
525 return float64_val(a)>>63;
529 /*----------------------------------------------------------------------------
530 | If `a' is denormal and we are in flush-to-zero mode then set the
531 | input-denormal exception and return zero. Otherwise just return the value.
532 *----------------------------------------------------------------------------*/
533 float64 float64_squash_input_denormal(float64 a, float_status *status)
535 if (status->flush_inputs_to_zero) {
536 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
537 float_raise(float_flag_input_denormal, status);
538 return make_float64(float64_val(a) & (1ULL << 63));
541 return a;
544 /*----------------------------------------------------------------------------
545 | Normalizes the subnormal double-precision floating-point value represented
546 | by the denormalized significand `aSig'. The normalized exponent and
547 | significand are stored at the locations pointed to by `zExpPtr' and
548 | `zSigPtr', respectively.
549 *----------------------------------------------------------------------------*/
551 static void
552 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
554 int8 shiftCount;
556 shiftCount = countLeadingZeros64( aSig ) - 11;
557 *zSigPtr = aSig<<shiftCount;
558 *zExpPtr = 1 - shiftCount;
562 /*----------------------------------------------------------------------------
563 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
564 | double-precision floating-point value, returning the result. After being
565 | shifted into the proper positions, the three fields are simply added
566 | together to form the result. This means that any integer portion of `zSig'
567 | will be added into the exponent. Since a properly normalized significand
568 | will have an integer portion equal to 1, the `zExp' input should be 1 less
569 | than the desired result exponent whenever `zSig' is a complete, normalized
570 | significand.
571 *----------------------------------------------------------------------------*/
573 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
576 return make_float64(
577 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
581 /*----------------------------------------------------------------------------
582 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
583 | and significand `zSig', and returns the proper double-precision floating-
584 | point value corresponding to the abstract input. Ordinarily, the abstract
585 | value is simply rounded and packed into the double-precision format, with
586 | the inexact exception raised if the abstract input cannot be represented
587 | exactly. However, if the abstract value is too large, the overflow and
588 | inexact exceptions are raised and an infinity or maximal finite value is
589 | returned. If the abstract value is too small, the input value is rounded to
590 | a subnormal number, and the underflow and inexact exceptions are raised if
591 | the abstract input cannot be represented exactly as a subnormal double-
592 | precision floating-point number.
593 | The input significand `zSig' has its binary point between bits 62
594 | and 61, which is 10 bits to the left of the usual location. This shifted
595 | significand must be normalized or smaller. If `zSig' is not normalized,
596 | `zExp' must be 0; in that case, the result returned is a subnormal number,
597 | and it must not require rounding. In the usual case that `zSig' is
598 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
599 | The handling of underflow and overflow follows the IEC/IEEE Standard for
600 | Binary Floating-Point Arithmetic.
601 *----------------------------------------------------------------------------*/
603 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
604 float_status *status)
606 int8 roundingMode;
607 flag roundNearestEven;
608 int_fast16_t roundIncrement, roundBits;
609 flag isTiny;
611 roundingMode = status->float_rounding_mode;
612 roundNearestEven = ( roundingMode == float_round_nearest_even );
613 switch (roundingMode) {
614 case float_round_nearest_even:
615 case float_round_ties_away:
616 roundIncrement = 0x200;
617 break;
618 case float_round_to_zero:
619 roundIncrement = 0;
620 break;
621 case float_round_up:
622 roundIncrement = zSign ? 0 : 0x3ff;
623 break;
624 case float_round_down:
625 roundIncrement = zSign ? 0x3ff : 0;
626 break;
627 default:
628 abort();
630 roundBits = zSig & 0x3FF;
631 if ( 0x7FD <= (uint16_t) zExp ) {
632 if ( ( 0x7FD < zExp )
633 || ( ( zExp == 0x7FD )
634 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
636 float_raise(float_flag_overflow | float_flag_inexact, status);
637 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
639 if ( zExp < 0 ) {
640 if (status->flush_to_zero) {
641 float_raise(float_flag_output_denormal, status);
642 return packFloat64(zSign, 0, 0);
644 isTiny =
645 (status->float_detect_tininess
646 == float_tininess_before_rounding)
647 || ( zExp < -1 )
648 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
649 shift64RightJamming( zSig, - zExp, &zSig );
650 zExp = 0;
651 roundBits = zSig & 0x3FF;
652 if (isTiny && roundBits) {
653 float_raise(float_flag_underflow, status);
657 if (roundBits) {
658 status->float_exception_flags |= float_flag_inexact;
660 zSig = ( zSig + roundIncrement )>>10;
661 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
662 if ( zSig == 0 ) zExp = 0;
663 return packFloat64( zSign, zExp, zSig );
667 /*----------------------------------------------------------------------------
668 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
669 | and significand `zSig', and returns the proper double-precision floating-
670 | point value corresponding to the abstract input. This routine is just like
671 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
672 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
673 | floating-point exponent.
674 *----------------------------------------------------------------------------*/
676 static float64
677 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
678 float_status *status)
680 int8 shiftCount;
682 shiftCount = countLeadingZeros64( zSig ) - 1;
683 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
684 status);
688 /*----------------------------------------------------------------------------
689 | Returns the fraction bits of the extended double-precision floating-point
690 | value `a'.
691 *----------------------------------------------------------------------------*/
693 static inline uint64_t extractFloatx80Frac( floatx80 a )
696 return a.low;
700 /*----------------------------------------------------------------------------
701 | Returns the exponent bits of the extended double-precision floating-point
702 | value `a'.
703 *----------------------------------------------------------------------------*/
705 static inline int32 extractFloatx80Exp( floatx80 a )
708 return a.high & 0x7FFF;
712 /*----------------------------------------------------------------------------
713 | Returns the sign bit of the extended double-precision floating-point value
714 | `a'.
715 *----------------------------------------------------------------------------*/
717 static inline flag extractFloatx80Sign( floatx80 a )
720 return a.high>>15;
724 /*----------------------------------------------------------------------------
725 | Normalizes the subnormal extended double-precision floating-point value
726 | represented by the denormalized significand `aSig'. The normalized exponent
727 | and significand are stored at the locations pointed to by `zExpPtr' and
728 | `zSigPtr', respectively.
729 *----------------------------------------------------------------------------*/
731 static void
732 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
734 int8 shiftCount;
736 shiftCount = countLeadingZeros64( aSig );
737 *zSigPtr = aSig<<shiftCount;
738 *zExpPtr = 1 - shiftCount;
742 /*----------------------------------------------------------------------------
743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
744 | extended double-precision floating-point value, returning the result.
745 *----------------------------------------------------------------------------*/
747 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
749 floatx80 z;
751 z.low = zSig;
752 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
753 return z;
757 /*----------------------------------------------------------------------------
758 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
759 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
760 | and returns the proper extended double-precision floating-point value
761 | corresponding to the abstract input. Ordinarily, the abstract value is
762 | rounded and packed into the extended double-precision format, with the
763 | inexact exception raised if the abstract input cannot be represented
764 | exactly. However, if the abstract value is too large, the overflow and
765 | inexact exceptions are raised and an infinity or maximal finite value is
766 | returned. If the abstract value is too small, the input value is rounded to
767 | a subnormal number, and the underflow and inexact exceptions are raised if
768 | the abstract input cannot be represented exactly as a subnormal extended
769 | double-precision floating-point number.
770 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
771 | number of bits as single or double precision, respectively. Otherwise, the
772 | result is rounded to the full precision of the extended double-precision
773 | format.
774 | The input significand must be normalized or smaller. If the input
775 | significand is not normalized, `zExp' must be 0; in that case, the result
776 | returned is a subnormal number, and it must not require rounding. The
777 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
778 | Floating-Point Arithmetic.
779 *----------------------------------------------------------------------------*/
781 static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
782 int32 zExp, uint64_t zSig0, uint64_t zSig1,
783 float_status *status)
785 int8 roundingMode;
786 flag roundNearestEven, increment, isTiny;
787 int64 roundIncrement, roundMask, roundBits;
789 roundingMode = status->float_rounding_mode;
790 roundNearestEven = ( roundingMode == float_round_nearest_even );
791 if ( roundingPrecision == 80 ) goto precision80;
792 if ( roundingPrecision == 64 ) {
793 roundIncrement = LIT64( 0x0000000000000400 );
794 roundMask = LIT64( 0x00000000000007FF );
796 else if ( roundingPrecision == 32 ) {
797 roundIncrement = LIT64( 0x0000008000000000 );
798 roundMask = LIT64( 0x000000FFFFFFFFFF );
800 else {
801 goto precision80;
803 zSig0 |= ( zSig1 != 0 );
804 switch (roundingMode) {
805 case float_round_nearest_even:
806 case float_round_ties_away:
807 break;
808 case float_round_to_zero:
809 roundIncrement = 0;
810 break;
811 case float_round_up:
812 roundIncrement = zSign ? 0 : roundMask;
813 break;
814 case float_round_down:
815 roundIncrement = zSign ? roundMask : 0;
816 break;
817 default:
818 abort();
820 roundBits = zSig0 & roundMask;
821 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
822 if ( ( 0x7FFE < zExp )
823 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
825 goto overflow;
827 if ( zExp <= 0 ) {
828 if (status->flush_to_zero) {
829 float_raise(float_flag_output_denormal, status);
830 return packFloatx80(zSign, 0, 0);
832 isTiny =
833 (status->float_detect_tininess
834 == float_tininess_before_rounding)
835 || ( zExp < 0 )
836 || ( zSig0 <= zSig0 + roundIncrement );
837 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
838 zExp = 0;
839 roundBits = zSig0 & roundMask;
840 if (isTiny && roundBits) {
841 float_raise(float_flag_underflow, status);
843 if (roundBits) {
844 status->float_exception_flags |= float_flag_inexact;
846 zSig0 += roundIncrement;
847 if ( (int64_t) zSig0 < 0 ) zExp = 1;
848 roundIncrement = roundMask + 1;
849 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
850 roundMask |= roundIncrement;
852 zSig0 &= ~ roundMask;
853 return packFloatx80( zSign, zExp, zSig0 );
856 if (roundBits) {
857 status->float_exception_flags |= float_flag_inexact;
859 zSig0 += roundIncrement;
860 if ( zSig0 < roundIncrement ) {
861 ++zExp;
862 zSig0 = LIT64( 0x8000000000000000 );
864 roundIncrement = roundMask + 1;
865 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
866 roundMask |= roundIncrement;
868 zSig0 &= ~ roundMask;
869 if ( zSig0 == 0 ) zExp = 0;
870 return packFloatx80( zSign, zExp, zSig0 );
871 precision80:
872 switch (roundingMode) {
873 case float_round_nearest_even:
874 case float_round_ties_away:
875 increment = ((int64_t)zSig1 < 0);
876 break;
877 case float_round_to_zero:
878 increment = 0;
879 break;
880 case float_round_up:
881 increment = !zSign && zSig1;
882 break;
883 case float_round_down:
884 increment = zSign && zSig1;
885 break;
886 default:
887 abort();
889 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
890 if ( ( 0x7FFE < zExp )
891 || ( ( zExp == 0x7FFE )
892 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
893 && increment
896 roundMask = 0;
897 overflow:
898 float_raise(float_flag_overflow | float_flag_inexact, status);
899 if ( ( roundingMode == float_round_to_zero )
900 || ( zSign && ( roundingMode == float_round_up ) )
901 || ( ! zSign && ( roundingMode == float_round_down ) )
903 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
905 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
907 if ( zExp <= 0 ) {
908 isTiny =
909 (status->float_detect_tininess
910 == float_tininess_before_rounding)
911 || ( zExp < 0 )
912 || ! increment
913 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
914 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
915 zExp = 0;
916 if (isTiny && zSig1) {
917 float_raise(float_flag_underflow, status);
919 if (zSig1) {
920 status->float_exception_flags |= float_flag_inexact;
922 switch (roundingMode) {
923 case float_round_nearest_even:
924 case float_round_ties_away:
925 increment = ((int64_t)zSig1 < 0);
926 break;
927 case float_round_to_zero:
928 increment = 0;
929 break;
930 case float_round_up:
931 increment = !zSign && zSig1;
932 break;
933 case float_round_down:
934 increment = zSign && zSig1;
935 break;
936 default:
937 abort();
939 if ( increment ) {
940 ++zSig0;
941 zSig0 &=
942 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
943 if ( (int64_t) zSig0 < 0 ) zExp = 1;
945 return packFloatx80( zSign, zExp, zSig0 );
948 if (zSig1) {
949 status->float_exception_flags |= float_flag_inexact;
951 if ( increment ) {
952 ++zSig0;
953 if ( zSig0 == 0 ) {
954 ++zExp;
955 zSig0 = LIT64( 0x8000000000000000 );
957 else {
958 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
961 else {
962 if ( zSig0 == 0 ) zExp = 0;
964 return packFloatx80( zSign, zExp, zSig0 );
968 /*----------------------------------------------------------------------------
969 | Takes an abstract floating-point value having sign `zSign', exponent
970 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
971 | and returns the proper extended double-precision floating-point value
972 | corresponding to the abstract input. This routine is just like
973 | `roundAndPackFloatx80' except that the input significand does not have to be
974 | normalized.
975 *----------------------------------------------------------------------------*/
977 static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
978 flag zSign, int32 zExp,
979 uint64_t zSig0, uint64_t zSig1,
980 float_status *status)
982 int8 shiftCount;
984 if ( zSig0 == 0 ) {
985 zSig0 = zSig1;
986 zSig1 = 0;
987 zExp -= 64;
989 shiftCount = countLeadingZeros64( zSig0 );
990 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
991 zExp -= shiftCount;
992 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
993 zSig0, zSig1, status);
997 /*----------------------------------------------------------------------------
998 | Returns the least-significant 64 fraction bits of the quadruple-precision
999 | floating-point value `a'.
1000 *----------------------------------------------------------------------------*/
1002 static inline uint64_t extractFloat128Frac1( float128 a )
1005 return a.low;
1009 /*----------------------------------------------------------------------------
1010 | Returns the most-significant 48 fraction bits of the quadruple-precision
1011 | floating-point value `a'.
1012 *----------------------------------------------------------------------------*/
1014 static inline uint64_t extractFloat128Frac0( float128 a )
1017 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1021 /*----------------------------------------------------------------------------
1022 | Returns the exponent bits of the quadruple-precision floating-point value
1023 | `a'.
1024 *----------------------------------------------------------------------------*/
1026 static inline int32 extractFloat128Exp( float128 a )
1029 return ( a.high>>48 ) & 0x7FFF;
1033 /*----------------------------------------------------------------------------
1034 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1035 *----------------------------------------------------------------------------*/
1037 static inline flag extractFloat128Sign( float128 a )
1040 return a.high>>63;
1044 /*----------------------------------------------------------------------------
1045 | Normalizes the subnormal quadruple-precision floating-point value
1046 | represented by the denormalized significand formed by the concatenation of
1047 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1048 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1049 | significand are stored at the location pointed to by `zSig0Ptr', and the
1050 | least significant 64 bits of the normalized significand are stored at the
1051 | location pointed to by `zSig1Ptr'.
1052 *----------------------------------------------------------------------------*/
1054 static void
1055 normalizeFloat128Subnormal(
1056 uint64_t aSig0,
1057 uint64_t aSig1,
1058 int32 *zExpPtr,
1059 uint64_t *zSig0Ptr,
1060 uint64_t *zSig1Ptr
1063 int8 shiftCount;
1065 if ( aSig0 == 0 ) {
1066 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1067 if ( shiftCount < 0 ) {
1068 *zSig0Ptr = aSig1>>( - shiftCount );
1069 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1071 else {
1072 *zSig0Ptr = aSig1<<shiftCount;
1073 *zSig1Ptr = 0;
1075 *zExpPtr = - shiftCount - 63;
1077 else {
1078 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1079 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1080 *zExpPtr = 1 - shiftCount;
1085 /*----------------------------------------------------------------------------
1086 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1087 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1088 | floating-point value, returning the result. After being shifted into the
1089 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1090 | added together to form the most significant 32 bits of the result. This
1091 | means that any integer portion of `zSig0' will be added into the exponent.
1092 | Since a properly normalized significand will have an integer portion equal
1093 | to 1, the `zExp' input should be 1 less than the desired result exponent
1094 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1095 | significand.
1096 *----------------------------------------------------------------------------*/
1098 static inline float128
1099 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1101 float128 z;
1103 z.low = zSig1;
1104 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1105 return z;
1109 /*----------------------------------------------------------------------------
1110 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1111 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1112 | and `zSig2', and returns the proper quadruple-precision floating-point value
1113 | corresponding to the abstract input. Ordinarily, the abstract value is
1114 | simply rounded and packed into the quadruple-precision format, with the
1115 | inexact exception raised if the abstract input cannot be represented
1116 | exactly. However, if the abstract value is too large, the overflow and
1117 | inexact exceptions are raised and an infinity or maximal finite value is
1118 | returned. If the abstract value is too small, the input value is rounded to
1119 | a subnormal number, and the underflow and inexact exceptions are raised if
1120 | the abstract input cannot be represented exactly as a subnormal quadruple-
1121 | precision floating-point number.
1122 | The input significand must be normalized or smaller. If the input
1123 | significand is not normalized, `zExp' must be 0; in that case, the result
1124 | returned is a subnormal number, and it must not require rounding. In the
1125 | usual case that the input significand is normalized, `zExp' must be 1 less
1126 | than the ``true'' floating-point exponent. The handling of underflow and
1127 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1128 *----------------------------------------------------------------------------*/
1130 static float128 roundAndPackFloat128(flag zSign, int32 zExp,
1131 uint64_t zSig0, uint64_t zSig1,
1132 uint64_t zSig2, float_status *status)
1134 int8 roundingMode;
1135 flag roundNearestEven, increment, isTiny;
1137 roundingMode = status->float_rounding_mode;
1138 roundNearestEven = ( roundingMode == float_round_nearest_even );
1139 switch (roundingMode) {
1140 case float_round_nearest_even:
1141 case float_round_ties_away:
1142 increment = ((int64_t)zSig2 < 0);
1143 break;
1144 case float_round_to_zero:
1145 increment = 0;
1146 break;
1147 case float_round_up:
1148 increment = !zSign && zSig2;
1149 break;
1150 case float_round_down:
1151 increment = zSign && zSig2;
1152 break;
1153 default:
1154 abort();
1156 if ( 0x7FFD <= (uint32_t) zExp ) {
1157 if ( ( 0x7FFD < zExp )
1158 || ( ( zExp == 0x7FFD )
1159 && eq128(
1160 LIT64( 0x0001FFFFFFFFFFFF ),
1161 LIT64( 0xFFFFFFFFFFFFFFFF ),
1162 zSig0,
1163 zSig1
1165 && increment
1168 float_raise(float_flag_overflow | float_flag_inexact, status);
1169 if ( ( roundingMode == float_round_to_zero )
1170 || ( zSign && ( roundingMode == float_round_up ) )
1171 || ( ! zSign && ( roundingMode == float_round_down ) )
1173 return
1174 packFloat128(
1175 zSign,
1176 0x7FFE,
1177 LIT64( 0x0000FFFFFFFFFFFF ),
1178 LIT64( 0xFFFFFFFFFFFFFFFF )
1181 return packFloat128( zSign, 0x7FFF, 0, 0 );
1183 if ( zExp < 0 ) {
1184 if (status->flush_to_zero) {
1185 float_raise(float_flag_output_denormal, status);
1186 return packFloat128(zSign, 0, 0, 0);
1188 isTiny =
1189 (status->float_detect_tininess
1190 == float_tininess_before_rounding)
1191 || ( zExp < -1 )
1192 || ! increment
1193 || lt128(
1194 zSig0,
1195 zSig1,
1196 LIT64( 0x0001FFFFFFFFFFFF ),
1197 LIT64( 0xFFFFFFFFFFFFFFFF )
1199 shift128ExtraRightJamming(
1200 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1201 zExp = 0;
1202 if (isTiny && zSig2) {
1203 float_raise(float_flag_underflow, status);
1205 switch (roundingMode) {
1206 case float_round_nearest_even:
1207 case float_round_ties_away:
1208 increment = ((int64_t)zSig2 < 0);
1209 break;
1210 case float_round_to_zero:
1211 increment = 0;
1212 break;
1213 case float_round_up:
1214 increment = !zSign && zSig2;
1215 break;
1216 case float_round_down:
1217 increment = zSign && zSig2;
1218 break;
1219 default:
1220 abort();
1224 if (zSig2) {
1225 status->float_exception_flags |= float_flag_inexact;
1227 if ( increment ) {
1228 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1229 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1231 else {
1232 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1234 return packFloat128( zSign, zExp, zSig0, zSig1 );
1238 /*----------------------------------------------------------------------------
1239 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1240 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1241 | returns the proper quadruple-precision floating-point value corresponding
1242 | to the abstract input. This routine is just like `roundAndPackFloat128'
1243 | except that the input significand has fewer bits and does not have to be
1244 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1245 | point exponent.
1246 *----------------------------------------------------------------------------*/
1248 static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1249 uint64_t zSig0, uint64_t zSig1,
1250 float_status *status)
1252 int8 shiftCount;
1253 uint64_t zSig2;
1255 if ( zSig0 == 0 ) {
1256 zSig0 = zSig1;
1257 zSig1 = 0;
1258 zExp -= 64;
1260 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1261 if ( 0 <= shiftCount ) {
1262 zSig2 = 0;
1263 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1265 else {
1266 shift128ExtraRightJamming(
1267 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1269 zExp -= shiftCount;
1270 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the single-precision floating-point format. The conversion is performed
1277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1278 *----------------------------------------------------------------------------*/
1280 float32 int32_to_float32(int32_t a, float_status *status)
1282 flag zSign;
1284 if ( a == 0 ) return float32_zero;
1285 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1286 zSign = ( a < 0 );
1287 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 32-bit two's complement integer `a'
1292 | to the double-precision floating-point format. The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1296 float64 int32_to_float64(int32_t a, float_status *status)
1298 flag zSign;
1299 uint32 absA;
1300 int8 shiftCount;
1301 uint64_t zSig;
1303 if ( a == 0 ) return float64_zero;
1304 zSign = ( a < 0 );
1305 absA = zSign ? - a : a;
1306 shiftCount = countLeadingZeros32( absA ) + 21;
1307 zSig = absA;
1308 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of converting the 32-bit two's complement integer `a'
1314 | to the extended double-precision floating-point format. The conversion
1315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 | Arithmetic.
1317 *----------------------------------------------------------------------------*/
1319 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1321 flag zSign;
1322 uint32 absA;
1323 int8 shiftCount;
1324 uint64_t zSig;
1326 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1327 zSign = ( a < 0 );
1328 absA = zSign ? - a : a;
1329 shiftCount = countLeadingZeros32( absA ) + 32;
1330 zSig = absA;
1331 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the 32-bit two's complement integer `a' to
1337 | the quadruple-precision floating-point format. The conversion is performed
1338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 *----------------------------------------------------------------------------*/
1341 float128 int32_to_float128(int32_t a, float_status *status)
1343 flag zSign;
1344 uint32 absA;
1345 int8 shiftCount;
1346 uint64_t zSig0;
1348 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349 zSign = ( a < 0 );
1350 absA = zSign ? - a : a;
1351 shiftCount = countLeadingZeros32( absA ) + 17;
1352 zSig0 = absA;
1353 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1357 /*----------------------------------------------------------------------------
1358 | Returns the result of converting the 64-bit two's complement integer `a'
1359 | to the single-precision floating-point format. The conversion is performed
1360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1361 *----------------------------------------------------------------------------*/
1363 float32 int64_to_float32(int64_t a, float_status *status)
1365 flag zSign;
1366 uint64 absA;
1367 int8 shiftCount;
1369 if ( a == 0 ) return float32_zero;
1370 zSign = ( a < 0 );
1371 absA = zSign ? - a : a;
1372 shiftCount = countLeadingZeros64( absA ) - 40;
1373 if ( 0 <= shiftCount ) {
1374 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1376 else {
1377 shiftCount += 7;
1378 if ( shiftCount < 0 ) {
1379 shift64RightJamming( absA, - shiftCount, &absA );
1381 else {
1382 absA <<= shiftCount;
1384 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1389 /*----------------------------------------------------------------------------
1390 | Returns the result of converting the 64-bit two's complement integer `a'
1391 | to the double-precision floating-point format. The conversion is performed
1392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1393 *----------------------------------------------------------------------------*/
1395 float64 int64_to_float64(int64_t a, float_status *status)
1397 flag zSign;
1399 if ( a == 0 ) return float64_zero;
1400 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1401 return packFloat64( 1, 0x43E, 0 );
1403 zSign = ( a < 0 );
1404 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the extended double-precision floating-point format. The conversion
1410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 | Arithmetic.
1412 *----------------------------------------------------------------------------*/
1414 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1416 flag zSign;
1417 uint64 absA;
1418 int8 shiftCount;
1420 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1421 zSign = ( a < 0 );
1422 absA = zSign ? - a : a;
1423 shiftCount = countLeadingZeros64( absA );
1424 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the 64-bit two's complement integer `a' to
1430 | the quadruple-precision floating-point format. The conversion is performed
1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432 *----------------------------------------------------------------------------*/
1434 float128 int64_to_float128(int64_t a, float_status *status)
1436 flag zSign;
1437 uint64 absA;
1438 int8 shiftCount;
1439 int32 zExp;
1440 uint64_t zSig0, zSig1;
1442 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1443 zSign = ( a < 0 );
1444 absA = zSign ? - a : a;
1445 shiftCount = countLeadingZeros64( absA ) + 49;
1446 zExp = 0x406E - shiftCount;
1447 if ( 64 <= shiftCount ) {
1448 zSig1 = 0;
1449 zSig0 = absA;
1450 shiftCount -= 64;
1452 else {
1453 zSig1 = absA;
1454 zSig0 = 0;
1456 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1457 return packFloat128( zSign, zExp, zSig0, zSig1 );
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the 64-bit unsigned integer `a'
1463 | to the single-precision floating-point format. The conversion is performed
1464 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1465 *----------------------------------------------------------------------------*/
1467 float32 uint64_to_float32(uint64_t a, float_status *status)
1469 int shiftcount;
1471 if (a == 0) {
1472 return float32_zero;
1475 /* Determine (left) shift needed to put first set bit into bit posn 23
1476 * (since packFloat32() expects the binary point between bits 23 and 22);
1477 * this is the fast case for smallish numbers.
1479 shiftcount = countLeadingZeros64(a) - 40;
1480 if (shiftcount >= 0) {
1481 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1483 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1484 * expects the binary point between bits 30 and 29, hence the + 7.
1486 shiftcount += 7;
1487 if (shiftcount < 0) {
1488 shift64RightJamming(a, -shiftcount, &a);
1489 } else {
1490 a <<= shiftcount;
1493 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the 64-bit unsigned integer `a'
1498 | to the double-precision floating-point format. The conversion is performed
1499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1500 *----------------------------------------------------------------------------*/
1502 float64 uint64_to_float64(uint64_t a, float_status *status)
1504 int exp = 0x43C;
1505 int shiftcount;
1507 if (a == 0) {
1508 return float64_zero;
1511 shiftcount = countLeadingZeros64(a) - 1;
1512 if (shiftcount < 0) {
1513 shift64RightJamming(a, -shiftcount, &a);
1514 } else {
1515 a <<= shiftcount;
1517 return roundAndPackFloat64(0, exp - shiftcount, a, status);
1520 /*----------------------------------------------------------------------------
1521 | Returns the result of converting the 64-bit unsigned integer `a'
1522 | to the quadruple-precision floating-point format. The conversion is performed
1523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524 *----------------------------------------------------------------------------*/
1526 float128 uint64_to_float128(uint64_t a, float_status *status)
1528 if (a == 0) {
1529 return float128_zero;
1531 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1534 /*----------------------------------------------------------------------------
1535 | Returns the result of converting the single-precision floating-point value
1536 | `a' to the 32-bit two's complement integer format. The conversion is
1537 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 | Arithmetic---which means in particular that the conversion is rounded
1539 | according to the current rounding mode. If `a' is a NaN, the largest
1540 | positive integer is returned. Otherwise, if the conversion overflows, the
1541 | largest integer with the same sign as `a' is returned.
1542 *----------------------------------------------------------------------------*/
1544 int32 float32_to_int32(float32 a, float_status *status)
1546 flag aSign;
1547 int_fast16_t aExp, shiftCount;
1548 uint32_t aSig;
1549 uint64_t aSig64;
1551 a = float32_squash_input_denormal(a, status);
1552 aSig = extractFloat32Frac( a );
1553 aExp = extractFloat32Exp( a );
1554 aSign = extractFloat32Sign( a );
1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556 if ( aExp ) aSig |= 0x00800000;
1557 shiftCount = 0xAF - aExp;
1558 aSig64 = aSig;
1559 aSig64 <<= 32;
1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561 return roundAndPackInt32(aSign, aSig64, status);
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format. The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1572 | returned.
1573 *----------------------------------------------------------------------------*/
1575 int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
1577 flag aSign;
1578 int_fast16_t aExp, shiftCount;
1579 uint32_t aSig;
1580 int32_t z;
1581 a = float32_squash_input_denormal(a, status);
1583 aSig = extractFloat32Frac( a );
1584 aExp = extractFloat32Exp( a );
1585 aSign = extractFloat32Sign( a );
1586 shiftCount = aExp - 0x9E;
1587 if ( 0 <= shiftCount ) {
1588 if ( float32_val(a) != 0xCF000000 ) {
1589 float_raise(float_flag_invalid, status);
1590 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592 return (int32_t) 0x80000000;
1594 else if ( aExp <= 0x7E ) {
1595 if (aExp | aSig) {
1596 status->float_exception_flags |= float_flag_inexact;
1598 return 0;
1600 aSig = ( aSig | 0x00800000 )<<8;
1601 z = aSig>>( - shiftCount );
1602 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1603 status->float_exception_flags |= float_flag_inexact;
1605 if ( aSign ) z = - z;
1606 return z;
1610 /*----------------------------------------------------------------------------
1611 | Returns the result of converting the single-precision floating-point value
1612 | `a' to the 16-bit two's complement integer format. The conversion is
1613 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1614 | Arithmetic, except that the conversion is always rounded toward zero.
1615 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1616 | the conversion overflows, the largest integer with the same sign as `a' is
1617 | returned.
1618 *----------------------------------------------------------------------------*/
1620 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1622 flag aSign;
1623 int_fast16_t aExp, shiftCount;
1624 uint32_t aSig;
1625 int32 z;
1627 aSig = extractFloat32Frac( a );
1628 aExp = extractFloat32Exp( a );
1629 aSign = extractFloat32Sign( a );
1630 shiftCount = aExp - 0x8E;
1631 if ( 0 <= shiftCount ) {
1632 if ( float32_val(a) != 0xC7000000 ) {
1633 float_raise(float_flag_invalid, status);
1634 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1635 return 0x7FFF;
1638 return (int32_t) 0xffff8000;
1640 else if ( aExp <= 0x7E ) {
1641 if ( aExp | aSig ) {
1642 status->float_exception_flags |= float_flag_inexact;
1644 return 0;
1646 shiftCount -= 0x10;
1647 aSig = ( aSig | 0x00800000 )<<8;
1648 z = aSig>>( - shiftCount );
1649 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1650 status->float_exception_flags |= float_flag_inexact;
1652 if ( aSign ) {
1653 z = - z;
1655 return z;
1659 /*----------------------------------------------------------------------------
1660 | Returns the result of converting the single-precision floating-point value
1661 | `a' to the 64-bit two's complement integer format. The conversion is
1662 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1663 | Arithmetic---which means in particular that the conversion is rounded
1664 | according to the current rounding mode. If `a' is a NaN, the largest
1665 | positive integer is returned. Otherwise, if the conversion overflows, the
1666 | largest integer with the same sign as `a' is returned.
1667 *----------------------------------------------------------------------------*/
1669 int64 float32_to_int64(float32 a, float_status *status)
1671 flag aSign;
1672 int_fast16_t aExp, shiftCount;
1673 uint32_t aSig;
1674 uint64_t aSig64, aSigExtra;
1675 a = float32_squash_input_denormal(a, status);
1677 aSig = extractFloat32Frac( a );
1678 aExp = extractFloat32Exp( a );
1679 aSign = extractFloat32Sign( a );
1680 shiftCount = 0xBE - aExp;
1681 if ( shiftCount < 0 ) {
1682 float_raise(float_flag_invalid, status);
1683 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1684 return LIT64( 0x7FFFFFFFFFFFFFFF );
1686 return (int64_t) LIT64( 0x8000000000000000 );
1688 if ( aExp ) aSig |= 0x00800000;
1689 aSig64 = aSig;
1690 aSig64 <<= 40;
1691 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1692 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the 64-bit unsigned integer format. The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic---which means in particular that the conversion is rounded
1701 | according to the current rounding mode. If `a' is a NaN, the largest
1702 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1703 | largest unsigned integer is returned. If the 'a' is negative, the result
1704 | is rounded and zero is returned; values that do not round to zero will
1705 | raise the inexact exception flag.
1706 *----------------------------------------------------------------------------*/
1708 uint64 float32_to_uint64(float32 a, float_status *status)
1710 flag aSign;
1711 int_fast16_t aExp, shiftCount;
1712 uint32_t aSig;
1713 uint64_t aSig64, aSigExtra;
1714 a = float32_squash_input_denormal(a, status);
1716 aSig = extractFloat32Frac(a);
1717 aExp = extractFloat32Exp(a);
1718 aSign = extractFloat32Sign(a);
1719 if ((aSign) && (aExp > 126)) {
1720 float_raise(float_flag_invalid, status);
1721 if (float32_is_any_nan(a)) {
1722 return LIT64(0xFFFFFFFFFFFFFFFF);
1723 } else {
1724 return 0;
1727 shiftCount = 0xBE - aExp;
1728 if (aExp) {
1729 aSig |= 0x00800000;
1731 if (shiftCount < 0) {
1732 float_raise(float_flag_invalid, status);
1733 return LIT64(0xFFFFFFFFFFFFFFFF);
1736 aSig64 = aSig;
1737 aSig64 <<= 40;
1738 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1739 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1742 /*----------------------------------------------------------------------------
1743 | Returns the result of converting the single-precision floating-point value
1744 | `a' to the 64-bit unsigned integer format. The conversion is
1745 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1746 | Arithmetic, except that the conversion is always rounded toward zero. If
1747 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1748 | conversion overflows, the largest unsigned integer is returned. If the
1749 | 'a' is negative, the result is rounded and zero is returned; values that do
1750 | not round to zero will raise the inexact flag.
1751 *----------------------------------------------------------------------------*/
1753 uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
1755 signed char current_rounding_mode = status->float_rounding_mode;
1756 set_float_rounding_mode(float_round_to_zero, status);
1757 int64_t v = float32_to_uint64(a, status);
1758 set_float_rounding_mode(current_rounding_mode, status);
1759 return v;
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of converting the single-precision floating-point value
1764 | `a' to the 64-bit two's complement integer format. The conversion is
1765 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1766 | Arithmetic, except that the conversion is always rounded toward zero. If
1767 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1768 | conversion overflows, the largest integer with the same sign as `a' is
1769 | returned.
1770 *----------------------------------------------------------------------------*/
1772 int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
1774 flag aSign;
1775 int_fast16_t aExp, shiftCount;
1776 uint32_t aSig;
1777 uint64_t aSig64;
1778 int64 z;
1779 a = float32_squash_input_denormal(a, status);
1781 aSig = extractFloat32Frac( a );
1782 aExp = extractFloat32Exp( a );
1783 aSign = extractFloat32Sign( a );
1784 shiftCount = aExp - 0xBE;
1785 if ( 0 <= shiftCount ) {
1786 if ( float32_val(a) != 0xDF000000 ) {
1787 float_raise(float_flag_invalid, status);
1788 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1789 return LIT64( 0x7FFFFFFFFFFFFFFF );
1792 return (int64_t) LIT64( 0x8000000000000000 );
1794 else if ( aExp <= 0x7E ) {
1795 if (aExp | aSig) {
1796 status->float_exception_flags |= float_flag_inexact;
1798 return 0;
1800 aSig64 = aSig | 0x00800000;
1801 aSig64 <<= 40;
1802 z = aSig64>>( - shiftCount );
1803 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1804 status->float_exception_flags |= float_flag_inexact;
1806 if ( aSign ) z = - z;
1807 return z;
1811 /*----------------------------------------------------------------------------
1812 | Returns the result of converting the single-precision floating-point value
1813 | `a' to the double-precision floating-point format. The conversion is
1814 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1815 | Arithmetic.
1816 *----------------------------------------------------------------------------*/
1818 float64 float32_to_float64(float32 a, float_status *status)
1820 flag aSign;
1821 int_fast16_t aExp;
1822 uint32_t aSig;
1823 a = float32_squash_input_denormal(a, status);
1825 aSig = extractFloat32Frac( a );
1826 aExp = extractFloat32Exp( a );
1827 aSign = extractFloat32Sign( a );
1828 if ( aExp == 0xFF ) {
1829 if (aSig) {
1830 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1832 return packFloat64( aSign, 0x7FF, 0 );
1834 if ( aExp == 0 ) {
1835 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1836 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1837 --aExp;
1839 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of converting the single-precision floating-point value
1845 | `a' to the extended double-precision floating-point format. The conversion
1846 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1847 | Arithmetic.
1848 *----------------------------------------------------------------------------*/
1850 floatx80 float32_to_floatx80(float32 a, float_status *status)
1852 flag aSign;
1853 int_fast16_t aExp;
1854 uint32_t aSig;
1856 a = float32_squash_input_denormal(a, status);
1857 aSig = extractFloat32Frac( a );
1858 aExp = extractFloat32Exp( a );
1859 aSign = extractFloat32Sign( a );
1860 if ( aExp == 0xFF ) {
1861 if (aSig) {
1862 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1864 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1866 if ( aExp == 0 ) {
1867 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1868 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1870 aSig |= 0x00800000;
1871 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the single-precision floating-point value
1877 | `a' to the double-precision floating-point format. The conversion is
1878 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 | Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float128 float32_to_float128(float32 a, float_status *status)
1884 flag aSign;
1885 int_fast16_t aExp;
1886 uint32_t aSig;
1888 a = float32_squash_input_denormal(a, status);
1889 aSig = extractFloat32Frac( a );
1890 aExp = extractFloat32Exp( a );
1891 aSign = extractFloat32Sign( a );
1892 if ( aExp == 0xFF ) {
1893 if (aSig) {
1894 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1896 return packFloat128( aSign, 0x7FFF, 0, 0 );
1898 if ( aExp == 0 ) {
1899 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1900 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1901 --aExp;
1903 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1907 /*----------------------------------------------------------------------------
1908 | Rounds the single-precision floating-point value `a' to an integer, and
1909 | returns the result as a single-precision floating-point value. The
1910 | operation is performed according to the IEC/IEEE Standard for Binary
1911 | Floating-Point Arithmetic.
1912 *----------------------------------------------------------------------------*/
1914 float32 float32_round_to_int(float32 a, float_status *status)
1916 flag aSign;
1917 int_fast16_t aExp;
1918 uint32_t lastBitMask, roundBitsMask;
1919 uint32_t z;
1920 a = float32_squash_input_denormal(a, status);
1922 aExp = extractFloat32Exp( a );
1923 if ( 0x96 <= aExp ) {
1924 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1925 return propagateFloat32NaN(a, a, status);
1927 return a;
1929 if ( aExp <= 0x7E ) {
1930 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1931 status->float_exception_flags |= float_flag_inexact;
1932 aSign = extractFloat32Sign( a );
1933 switch (status->float_rounding_mode) {
1934 case float_round_nearest_even:
1935 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1936 return packFloat32( aSign, 0x7F, 0 );
1938 break;
1939 case float_round_ties_away:
1940 if (aExp == 0x7E) {
1941 return packFloat32(aSign, 0x7F, 0);
1943 break;
1944 case float_round_down:
1945 return make_float32(aSign ? 0xBF800000 : 0);
1946 case float_round_up:
1947 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1949 return packFloat32( aSign, 0, 0 );
1951 lastBitMask = 1;
1952 lastBitMask <<= 0x96 - aExp;
1953 roundBitsMask = lastBitMask - 1;
1954 z = float32_val(a);
1955 switch (status->float_rounding_mode) {
1956 case float_round_nearest_even:
1957 z += lastBitMask>>1;
1958 if ((z & roundBitsMask) == 0) {
1959 z &= ~lastBitMask;
1961 break;
1962 case float_round_ties_away:
1963 z += lastBitMask >> 1;
1964 break;
1965 case float_round_to_zero:
1966 break;
1967 case float_round_up:
1968 if (!extractFloat32Sign(make_float32(z))) {
1969 z += roundBitsMask;
1971 break;
1972 case float_round_down:
1973 if (extractFloat32Sign(make_float32(z))) {
1974 z += roundBitsMask;
1976 break;
1977 default:
1978 abort();
1980 z &= ~ roundBitsMask;
1981 if (z != float32_val(a)) {
1982 status->float_exception_flags |= float_flag_inexact;
1984 return make_float32(z);
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of adding the absolute values of the single-precision
1990 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1991 | before being returned. `zSign' is ignored if the result is a NaN.
1992 | The addition is performed according to the IEC/IEEE Standard for Binary
1993 | Floating-Point Arithmetic.
1994 *----------------------------------------------------------------------------*/
1996 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1997 float_status *status)
1999 int_fast16_t aExp, bExp, zExp;
2000 uint32_t aSig, bSig, zSig;
2001 int_fast16_t expDiff;
2003 aSig = extractFloat32Frac( a );
2004 aExp = extractFloat32Exp( a );
2005 bSig = extractFloat32Frac( b );
2006 bExp = extractFloat32Exp( b );
2007 expDiff = aExp - bExp;
2008 aSig <<= 6;
2009 bSig <<= 6;
2010 if ( 0 < expDiff ) {
2011 if ( aExp == 0xFF ) {
2012 if (aSig) {
2013 return propagateFloat32NaN(a, b, status);
2015 return a;
2017 if ( bExp == 0 ) {
2018 --expDiff;
2020 else {
2021 bSig |= 0x20000000;
2023 shift32RightJamming( bSig, expDiff, &bSig );
2024 zExp = aExp;
2026 else if ( expDiff < 0 ) {
2027 if ( bExp == 0xFF ) {
2028 if (bSig) {
2029 return propagateFloat32NaN(a, b, status);
2031 return packFloat32( zSign, 0xFF, 0 );
2033 if ( aExp == 0 ) {
2034 ++expDiff;
2036 else {
2037 aSig |= 0x20000000;
2039 shift32RightJamming( aSig, - expDiff, &aSig );
2040 zExp = bExp;
2042 else {
2043 if ( aExp == 0xFF ) {
2044 if (aSig | bSig) {
2045 return propagateFloat32NaN(a, b, status);
2047 return a;
2049 if ( aExp == 0 ) {
2050 if (status->flush_to_zero) {
2051 if (aSig | bSig) {
2052 float_raise(float_flag_output_denormal, status);
2054 return packFloat32(zSign, 0, 0);
2056 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2058 zSig = 0x40000000 + aSig + bSig;
2059 zExp = aExp;
2060 goto roundAndPack;
2062 aSig |= 0x20000000;
2063 zSig = ( aSig + bSig )<<1;
2064 --zExp;
2065 if ( (int32_t) zSig < 0 ) {
2066 zSig = aSig + bSig;
2067 ++zExp;
2069 roundAndPack:
2070 return roundAndPackFloat32(zSign, zExp, zSig, status);
2074 /*----------------------------------------------------------------------------
2075 | Returns the result of subtracting the absolute values of the single-
2076 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2077 | difference is negated before being returned. `zSign' is ignored if the
2078 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2079 | Standard for Binary Floating-Point Arithmetic.
2080 *----------------------------------------------------------------------------*/
2082 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2083 float_status *status)
2085 int_fast16_t aExp, bExp, zExp;
2086 uint32_t aSig, bSig, zSig;
2087 int_fast16_t expDiff;
2089 aSig = extractFloat32Frac( a );
2090 aExp = extractFloat32Exp( a );
2091 bSig = extractFloat32Frac( b );
2092 bExp = extractFloat32Exp( b );
2093 expDiff = aExp - bExp;
2094 aSig <<= 7;
2095 bSig <<= 7;
2096 if ( 0 < expDiff ) goto aExpBigger;
2097 if ( expDiff < 0 ) goto bExpBigger;
2098 if ( aExp == 0xFF ) {
2099 if (aSig | bSig) {
2100 return propagateFloat32NaN(a, b, status);
2102 float_raise(float_flag_invalid, status);
2103 return float32_default_nan;
2105 if ( aExp == 0 ) {
2106 aExp = 1;
2107 bExp = 1;
2109 if ( bSig < aSig ) goto aBigger;
2110 if ( aSig < bSig ) goto bBigger;
2111 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2112 bExpBigger:
2113 if ( bExp == 0xFF ) {
2114 if (bSig) {
2115 return propagateFloat32NaN(a, b, status);
2117 return packFloat32( zSign ^ 1, 0xFF, 0 );
2119 if ( aExp == 0 ) {
2120 ++expDiff;
2122 else {
2123 aSig |= 0x40000000;
2125 shift32RightJamming( aSig, - expDiff, &aSig );
2126 bSig |= 0x40000000;
2127 bBigger:
2128 zSig = bSig - aSig;
2129 zExp = bExp;
2130 zSign ^= 1;
2131 goto normalizeRoundAndPack;
2132 aExpBigger:
2133 if ( aExp == 0xFF ) {
2134 if (aSig) {
2135 return propagateFloat32NaN(a, b, status);
2137 return a;
2139 if ( bExp == 0 ) {
2140 --expDiff;
2142 else {
2143 bSig |= 0x40000000;
2145 shift32RightJamming( bSig, expDiff, &bSig );
2146 aSig |= 0x40000000;
2147 aBigger:
2148 zSig = aSig - bSig;
2149 zExp = aExp;
2150 normalizeRoundAndPack:
2151 --zExp;
2152 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of adding the single-precision floating-point values `a'
2158 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2159 | Binary Floating-Point Arithmetic.
2160 *----------------------------------------------------------------------------*/
2162 float32 float32_add(float32 a, float32 b, float_status *status)
2164 flag aSign, bSign;
2165 a = float32_squash_input_denormal(a, status);
2166 b = float32_squash_input_denormal(b, status);
2168 aSign = extractFloat32Sign( a );
2169 bSign = extractFloat32Sign( b );
2170 if ( aSign == bSign ) {
2171 return addFloat32Sigs(a, b, aSign, status);
2173 else {
2174 return subFloat32Sigs(a, b, aSign, status);
2179 /*----------------------------------------------------------------------------
2180 | Returns the result of subtracting the single-precision floating-point values
2181 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2182 | for Binary Floating-Point Arithmetic.
2183 *----------------------------------------------------------------------------*/
2185 float32 float32_sub(float32 a, float32 b, float_status *status)
2187 flag aSign, bSign;
2188 a = float32_squash_input_denormal(a, status);
2189 b = float32_squash_input_denormal(b, status);
2191 aSign = extractFloat32Sign( a );
2192 bSign = extractFloat32Sign( b );
2193 if ( aSign == bSign ) {
2194 return subFloat32Sigs(a, b, aSign, status);
2196 else {
2197 return addFloat32Sigs(a, b, aSign, status);
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of multiplying the single-precision floating-point values
2204 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2205 | for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2208 float32 float32_mul(float32 a, float32 b, float_status *status)
2210 flag aSign, bSign, zSign;
2211 int_fast16_t aExp, bExp, zExp;
2212 uint32_t aSig, bSig;
2213 uint64_t zSig64;
2214 uint32_t zSig;
2216 a = float32_squash_input_denormal(a, status);
2217 b = float32_squash_input_denormal(b, status);
2219 aSig = extractFloat32Frac( a );
2220 aExp = extractFloat32Exp( a );
2221 aSign = extractFloat32Sign( a );
2222 bSig = extractFloat32Frac( b );
2223 bExp = extractFloat32Exp( b );
2224 bSign = extractFloat32Sign( b );
2225 zSign = aSign ^ bSign;
2226 if ( aExp == 0xFF ) {
2227 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2228 return propagateFloat32NaN(a, b, status);
2230 if ( ( bExp | bSig ) == 0 ) {
2231 float_raise(float_flag_invalid, status);
2232 return float32_default_nan;
2234 return packFloat32( zSign, 0xFF, 0 );
2236 if ( bExp == 0xFF ) {
2237 if (bSig) {
2238 return propagateFloat32NaN(a, b, status);
2240 if ( ( aExp | aSig ) == 0 ) {
2241 float_raise(float_flag_invalid, status);
2242 return float32_default_nan;
2244 return packFloat32( zSign, 0xFF, 0 );
2246 if ( aExp == 0 ) {
2247 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2248 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2250 if ( bExp == 0 ) {
2251 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2252 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2254 zExp = aExp + bExp - 0x7F;
2255 aSig = ( aSig | 0x00800000 )<<7;
2256 bSig = ( bSig | 0x00800000 )<<8;
2257 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2258 zSig = zSig64;
2259 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2260 zSig <<= 1;
2261 --zExp;
2263 return roundAndPackFloat32(zSign, zExp, zSig, status);
2267 /*----------------------------------------------------------------------------
2268 | Returns the result of dividing the single-precision floating-point value `a'
2269 | by the corresponding value `b'. The operation is performed according to the
2270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2273 float32 float32_div(float32 a, float32 b, float_status *status)
2275 flag aSign, bSign, zSign;
2276 int_fast16_t aExp, bExp, zExp;
2277 uint32_t aSig, bSig, zSig;
2278 a = float32_squash_input_denormal(a, status);
2279 b = float32_squash_input_denormal(b, status);
2281 aSig = extractFloat32Frac( a );
2282 aExp = extractFloat32Exp( a );
2283 aSign = extractFloat32Sign( a );
2284 bSig = extractFloat32Frac( b );
2285 bExp = extractFloat32Exp( b );
2286 bSign = extractFloat32Sign( b );
2287 zSign = aSign ^ bSign;
2288 if ( aExp == 0xFF ) {
2289 if (aSig) {
2290 return propagateFloat32NaN(a, b, status);
2292 if ( bExp == 0xFF ) {
2293 if (bSig) {
2294 return propagateFloat32NaN(a, b, status);
2296 float_raise(float_flag_invalid, status);
2297 return float32_default_nan;
2299 return packFloat32( zSign, 0xFF, 0 );
2301 if ( bExp == 0xFF ) {
2302 if (bSig) {
2303 return propagateFloat32NaN(a, b, status);
2305 return packFloat32( zSign, 0, 0 );
2307 if ( bExp == 0 ) {
2308 if ( bSig == 0 ) {
2309 if ( ( aExp | aSig ) == 0 ) {
2310 float_raise(float_flag_invalid, status);
2311 return float32_default_nan;
2313 float_raise(float_flag_divbyzero, status);
2314 return packFloat32( zSign, 0xFF, 0 );
2316 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2318 if ( aExp == 0 ) {
2319 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2320 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2322 zExp = aExp - bExp + 0x7D;
2323 aSig = ( aSig | 0x00800000 )<<7;
2324 bSig = ( bSig | 0x00800000 )<<8;
2325 if ( bSig <= ( aSig + aSig ) ) {
2326 aSig >>= 1;
2327 ++zExp;
2329 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2330 if ( ( zSig & 0x3F ) == 0 ) {
2331 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2333 return roundAndPackFloat32(zSign, zExp, zSig, status);
2337 /*----------------------------------------------------------------------------
2338 | Returns the remainder of the single-precision floating-point value `a'
2339 | with respect to the corresponding value `b'. The operation is performed
2340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2341 *----------------------------------------------------------------------------*/
2343 float32 float32_rem(float32 a, float32 b, float_status *status)
2345 flag aSign, zSign;
2346 int_fast16_t aExp, bExp, expDiff;
2347 uint32_t aSig, bSig;
2348 uint32_t q;
2349 uint64_t aSig64, bSig64, q64;
2350 uint32_t alternateASig;
2351 int32_t sigMean;
2352 a = float32_squash_input_denormal(a, status);
2353 b = float32_squash_input_denormal(b, status);
2355 aSig = extractFloat32Frac( a );
2356 aExp = extractFloat32Exp( a );
2357 aSign = extractFloat32Sign( a );
2358 bSig = extractFloat32Frac( b );
2359 bExp = extractFloat32Exp( b );
2360 if ( aExp == 0xFF ) {
2361 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2362 return propagateFloat32NaN(a, b, status);
2364 float_raise(float_flag_invalid, status);
2365 return float32_default_nan;
2367 if ( bExp == 0xFF ) {
2368 if (bSig) {
2369 return propagateFloat32NaN(a, b, status);
2371 return a;
2373 if ( bExp == 0 ) {
2374 if ( bSig == 0 ) {
2375 float_raise(float_flag_invalid, status);
2376 return float32_default_nan;
2378 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2380 if ( aExp == 0 ) {
2381 if ( aSig == 0 ) return a;
2382 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2384 expDiff = aExp - bExp;
2385 aSig |= 0x00800000;
2386 bSig |= 0x00800000;
2387 if ( expDiff < 32 ) {
2388 aSig <<= 8;
2389 bSig <<= 8;
2390 if ( expDiff < 0 ) {
2391 if ( expDiff < -1 ) return a;
2392 aSig >>= 1;
2394 q = ( bSig <= aSig );
2395 if ( q ) aSig -= bSig;
2396 if ( 0 < expDiff ) {
2397 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2398 q >>= 32 - expDiff;
2399 bSig >>= 2;
2400 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2402 else {
2403 aSig >>= 2;
2404 bSig >>= 2;
2407 else {
2408 if ( bSig <= aSig ) aSig -= bSig;
2409 aSig64 = ( (uint64_t) aSig )<<40;
2410 bSig64 = ( (uint64_t) bSig )<<40;
2411 expDiff -= 64;
2412 while ( 0 < expDiff ) {
2413 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2414 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2415 aSig64 = - ( ( bSig * q64 )<<38 );
2416 expDiff -= 62;
2418 expDiff += 64;
2419 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2420 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2421 q = q64>>( 64 - expDiff );
2422 bSig <<= 6;
2423 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2425 do {
2426 alternateASig = aSig;
2427 ++q;
2428 aSig -= bSig;
2429 } while ( 0 <= (int32_t) aSig );
2430 sigMean = aSig + alternateASig;
2431 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2432 aSig = alternateASig;
2434 zSign = ( (int32_t) aSig < 0 );
2435 if ( zSign ) aSig = - aSig;
2436 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2439 /*----------------------------------------------------------------------------
2440 | Returns the result of multiplying the single-precision floating-point values
2441 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2442 | multiplication. The operation is performed according to the IEC/IEEE
2443 | Standard for Binary Floating-Point Arithmetic 754-2008.
2444 | The flags argument allows the caller to select negation of the
2445 | addend, the intermediate product, or the final result. (The difference
2446 | between this and having the caller do a separate negation is that negating
2447 | externally will flip the sign bit on NaNs.)
2448 *----------------------------------------------------------------------------*/
2450 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2451 float_status *status)
2453 flag aSign, bSign, cSign, zSign;
2454 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2455 uint32_t aSig, bSig, cSig;
2456 flag pInf, pZero, pSign;
2457 uint64_t pSig64, cSig64, zSig64;
2458 uint32_t pSig;
2459 int shiftcount;
2460 flag signflip, infzero;
2462 a = float32_squash_input_denormal(a, status);
2463 b = float32_squash_input_denormal(b, status);
2464 c = float32_squash_input_denormal(c, status);
2465 aSig = extractFloat32Frac(a);
2466 aExp = extractFloat32Exp(a);
2467 aSign = extractFloat32Sign(a);
2468 bSig = extractFloat32Frac(b);
2469 bExp = extractFloat32Exp(b);
2470 bSign = extractFloat32Sign(b);
2471 cSig = extractFloat32Frac(c);
2472 cExp = extractFloat32Exp(c);
2473 cSign = extractFloat32Sign(c);
2475 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2476 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2478 /* It is implementation-defined whether the cases of (0,inf,qnan)
2479 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2480 * they return if they do), so we have to hand this information
2481 * off to the target-specific pick-a-NaN routine.
2483 if (((aExp == 0xff) && aSig) ||
2484 ((bExp == 0xff) && bSig) ||
2485 ((cExp == 0xff) && cSig)) {
2486 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2489 if (infzero) {
2490 float_raise(float_flag_invalid, status);
2491 return float32_default_nan;
2494 if (flags & float_muladd_negate_c) {
2495 cSign ^= 1;
2498 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2500 /* Work out the sign and type of the product */
2501 pSign = aSign ^ bSign;
2502 if (flags & float_muladd_negate_product) {
2503 pSign ^= 1;
2505 pInf = (aExp == 0xff) || (bExp == 0xff);
2506 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2508 if (cExp == 0xff) {
2509 if (pInf && (pSign ^ cSign)) {
2510 /* addition of opposite-signed infinities => InvalidOperation */
2511 float_raise(float_flag_invalid, status);
2512 return float32_default_nan;
2514 /* Otherwise generate an infinity of the same sign */
2515 return packFloat32(cSign ^ signflip, 0xff, 0);
2518 if (pInf) {
2519 return packFloat32(pSign ^ signflip, 0xff, 0);
2522 if (pZero) {
2523 if (cExp == 0) {
2524 if (cSig == 0) {
2525 /* Adding two exact zeroes */
2526 if (pSign == cSign) {
2527 zSign = pSign;
2528 } else if (status->float_rounding_mode == float_round_down) {
2529 zSign = 1;
2530 } else {
2531 zSign = 0;
2533 return packFloat32(zSign ^ signflip, 0, 0);
2535 /* Exact zero plus a denorm */
2536 if (status->flush_to_zero) {
2537 float_raise(float_flag_output_denormal, status);
2538 return packFloat32(cSign ^ signflip, 0, 0);
2541 /* Zero plus something non-zero : just return the something */
2542 if (flags & float_muladd_halve_result) {
2543 if (cExp == 0) {
2544 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2546 /* Subtract one to halve, and one again because roundAndPackFloat32
2547 * wants one less than the true exponent.
2549 cExp -= 2;
2550 cSig = (cSig | 0x00800000) << 7;
2551 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2553 return packFloat32(cSign ^ signflip, cExp, cSig);
2556 if (aExp == 0) {
2557 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2559 if (bExp == 0) {
2560 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2563 /* Calculate the actual result a * b + c */
2565 /* Multiply first; this is easy. */
2566 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2567 * because we want the true exponent, not the "one-less-than"
2568 * flavour that roundAndPackFloat32() takes.
2570 pExp = aExp + bExp - 0x7e;
2571 aSig = (aSig | 0x00800000) << 7;
2572 bSig = (bSig | 0x00800000) << 8;
2573 pSig64 = (uint64_t)aSig * bSig;
2574 if ((int64_t)(pSig64 << 1) >= 0) {
2575 pSig64 <<= 1;
2576 pExp--;
2579 zSign = pSign ^ signflip;
2581 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2582 * position 62.
2584 if (cExp == 0) {
2585 if (!cSig) {
2586 /* Throw out the special case of c being an exact zero now */
2587 shift64RightJamming(pSig64, 32, &pSig64);
2588 pSig = pSig64;
2589 if (flags & float_muladd_halve_result) {
2590 pExp--;
2592 return roundAndPackFloat32(zSign, pExp - 1,
2593 pSig, status);
2595 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2598 cSig64 = (uint64_t)cSig << (62 - 23);
2599 cSig64 |= LIT64(0x4000000000000000);
2600 expDiff = pExp - cExp;
2602 if (pSign == cSign) {
2603 /* Addition */
2604 if (expDiff > 0) {
2605 /* scale c to match p */
2606 shift64RightJamming(cSig64, expDiff, &cSig64);
2607 zExp = pExp;
2608 } else if (expDiff < 0) {
2609 /* scale p to match c */
2610 shift64RightJamming(pSig64, -expDiff, &pSig64);
2611 zExp = cExp;
2612 } else {
2613 /* no scaling needed */
2614 zExp = cExp;
2616 /* Add significands and make sure explicit bit ends up in posn 62 */
2617 zSig64 = pSig64 + cSig64;
2618 if ((int64_t)zSig64 < 0) {
2619 shift64RightJamming(zSig64, 1, &zSig64);
2620 } else {
2621 zExp--;
2623 } else {
2624 /* Subtraction */
2625 if (expDiff > 0) {
2626 shift64RightJamming(cSig64, expDiff, &cSig64);
2627 zSig64 = pSig64 - cSig64;
2628 zExp = pExp;
2629 } else if (expDiff < 0) {
2630 shift64RightJamming(pSig64, -expDiff, &pSig64);
2631 zSig64 = cSig64 - pSig64;
2632 zExp = cExp;
2633 zSign ^= 1;
2634 } else {
2635 zExp = pExp;
2636 if (cSig64 < pSig64) {
2637 zSig64 = pSig64 - cSig64;
2638 } else if (pSig64 < cSig64) {
2639 zSig64 = cSig64 - pSig64;
2640 zSign ^= 1;
2641 } else {
2642 /* Exact zero */
2643 zSign = signflip;
2644 if (status->float_rounding_mode == float_round_down) {
2645 zSign ^= 1;
2647 return packFloat32(zSign, 0, 0);
2650 --zExp;
2651 /* Normalize to put the explicit bit back into bit 62. */
2652 shiftcount = countLeadingZeros64(zSig64) - 1;
2653 zSig64 <<= shiftcount;
2654 zExp -= shiftcount;
2656 if (flags & float_muladd_halve_result) {
2657 zExp--;
2660 shift64RightJamming(zSig64, 32, &zSig64);
2661 return roundAndPackFloat32(zSign, zExp, zSig64, status);
2665 /*----------------------------------------------------------------------------
2666 | Returns the square root of the single-precision floating-point value `a'.
2667 | The operation is performed according to the IEC/IEEE Standard for Binary
2668 | Floating-Point Arithmetic.
2669 *----------------------------------------------------------------------------*/
2671 float32 float32_sqrt(float32 a, float_status *status)
2673 flag aSign;
2674 int_fast16_t aExp, zExp;
2675 uint32_t aSig, zSig;
2676 uint64_t rem, term;
2677 a = float32_squash_input_denormal(a, status);
2679 aSig = extractFloat32Frac( a );
2680 aExp = extractFloat32Exp( a );
2681 aSign = extractFloat32Sign( a );
2682 if ( aExp == 0xFF ) {
2683 if (aSig) {
2684 return propagateFloat32NaN(a, float32_zero, status);
2686 if ( ! aSign ) return a;
2687 float_raise(float_flag_invalid, status);
2688 return float32_default_nan;
2690 if ( aSign ) {
2691 if ( ( aExp | aSig ) == 0 ) return a;
2692 float_raise(float_flag_invalid, status);
2693 return float32_default_nan;
2695 if ( aExp == 0 ) {
2696 if ( aSig == 0 ) return float32_zero;
2697 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2699 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2700 aSig = ( aSig | 0x00800000 )<<8;
2701 zSig = estimateSqrt32( aExp, aSig ) + 2;
2702 if ( ( zSig & 0x7F ) <= 5 ) {
2703 if ( zSig < 2 ) {
2704 zSig = 0x7FFFFFFF;
2705 goto roundAndPack;
2707 aSig >>= aExp & 1;
2708 term = ( (uint64_t) zSig ) * zSig;
2709 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2710 while ( (int64_t) rem < 0 ) {
2711 --zSig;
2712 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2714 zSig |= ( rem != 0 );
2716 shift32RightJamming( zSig, 1, &zSig );
2717 roundAndPack:
2718 return roundAndPackFloat32(0, zExp, zSig, status);
2722 /*----------------------------------------------------------------------------
2723 | Returns the binary exponential of the single-precision floating-point value
2724 | `a'. The operation is performed according to the IEC/IEEE Standard for
2725 | Binary Floating-Point Arithmetic.
2727 | Uses the following identities:
2729 | 1. -------------------------------------------------------------------------
2730 | x x*ln(2)
2731 | 2 = e
2733 | 2. -------------------------------------------------------------------------
2734 | 2 3 4 5 n
2735 | x x x x x x x
2736 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2737 | 1! 2! 3! 4! 5! n!
2738 *----------------------------------------------------------------------------*/
2740 static const float64 float32_exp2_coefficients[15] =
2742 const_float64( 0x3ff0000000000000ll ), /* 1 */
2743 const_float64( 0x3fe0000000000000ll ), /* 2 */
2744 const_float64( 0x3fc5555555555555ll ), /* 3 */
2745 const_float64( 0x3fa5555555555555ll ), /* 4 */
2746 const_float64( 0x3f81111111111111ll ), /* 5 */
2747 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2748 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2749 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2750 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2751 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2752 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2753 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2754 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2755 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2756 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2759 float32 float32_exp2(float32 a, float_status *status)
2761 flag aSign;
2762 int_fast16_t aExp;
2763 uint32_t aSig;
2764 float64 r, x, xn;
2765 int i;
2766 a = float32_squash_input_denormal(a, status);
2768 aSig = extractFloat32Frac( a );
2769 aExp = extractFloat32Exp( a );
2770 aSign = extractFloat32Sign( a );
2772 if ( aExp == 0xFF) {
2773 if (aSig) {
2774 return propagateFloat32NaN(a, float32_zero, status);
2776 return (aSign) ? float32_zero : a;
2778 if (aExp == 0) {
2779 if (aSig == 0) return float32_one;
2782 float_raise(float_flag_inexact, status);
2784 /* ******************************* */
2785 /* using float64 for approximation */
2786 /* ******************************* */
2787 x = float32_to_float64(a, status);
2788 x = float64_mul(x, float64_ln2, status);
2790 xn = x;
2791 r = float64_one;
2792 for (i = 0 ; i < 15 ; i++) {
2793 float64 f;
2795 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2796 r = float64_add(r, f, status);
2798 xn = float64_mul(xn, x, status);
2801 return float64_to_float32(r, status);
2804 /*----------------------------------------------------------------------------
2805 | Returns the binary log of the single-precision floating-point value `a'.
2806 | The operation is performed according to the IEC/IEEE Standard for Binary
2807 | Floating-Point Arithmetic.
2808 *----------------------------------------------------------------------------*/
2809 float32 float32_log2(float32 a, float_status *status)
2811 flag aSign, zSign;
2812 int_fast16_t aExp;
2813 uint32_t aSig, zSig, i;
2815 a = float32_squash_input_denormal(a, status);
2816 aSig = extractFloat32Frac( a );
2817 aExp = extractFloat32Exp( a );
2818 aSign = extractFloat32Sign( a );
2820 if ( aExp == 0 ) {
2821 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2822 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2824 if ( aSign ) {
2825 float_raise(float_flag_invalid, status);
2826 return float32_default_nan;
2828 if ( aExp == 0xFF ) {
2829 if (aSig) {
2830 return propagateFloat32NaN(a, float32_zero, status);
2832 return a;
2835 aExp -= 0x7F;
2836 aSig |= 0x00800000;
2837 zSign = aExp < 0;
2838 zSig = aExp << 23;
2840 for (i = 1 << 22; i > 0; i >>= 1) {
2841 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2842 if ( aSig & 0x01000000 ) {
2843 aSig >>= 1;
2844 zSig |= i;
2848 if ( zSign )
2849 zSig = -zSig;
2851 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2854 /*----------------------------------------------------------------------------
2855 | Returns 1 if the single-precision floating-point value `a' is equal to
2856 | the corresponding value `b', and 0 otherwise. The invalid exception is
2857 | raised if either operand is a NaN. Otherwise, the comparison is performed
2858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2859 *----------------------------------------------------------------------------*/
2861 int float32_eq(float32 a, float32 b, float_status *status)
2863 uint32_t av, bv;
2864 a = float32_squash_input_denormal(a, status);
2865 b = float32_squash_input_denormal(b, status);
2867 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2868 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2870 float_raise(float_flag_invalid, status);
2871 return 0;
2873 av = float32_val(a);
2874 bv = float32_val(b);
2875 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is less than
2880 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2881 | exception is raised if either operand is a NaN. The comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 int float32_le(float32 a, float32 b, float_status *status)
2887 flag aSign, bSign;
2888 uint32_t av, bv;
2889 a = float32_squash_input_denormal(a, status);
2890 b = float32_squash_input_denormal(b, status);
2892 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2895 float_raise(float_flag_invalid, status);
2896 return 0;
2898 aSign = extractFloat32Sign( a );
2899 bSign = extractFloat32Sign( b );
2900 av = float32_val(a);
2901 bv = float32_val(b);
2902 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2903 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2907 /*----------------------------------------------------------------------------
2908 | Returns 1 if the single-precision floating-point value `a' is less than
2909 | the corresponding value `b', and 0 otherwise. The invalid exception is
2910 | raised if either operand is a NaN. The comparison is performed according
2911 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2912 *----------------------------------------------------------------------------*/
2914 int float32_lt(float32 a, float32 b, float_status *status)
2916 flag aSign, bSign;
2917 uint32_t av, bv;
2918 a = float32_squash_input_denormal(a, status);
2919 b = float32_squash_input_denormal(b, status);
2921 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2922 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2924 float_raise(float_flag_invalid, status);
2925 return 0;
2927 aSign = extractFloat32Sign( a );
2928 bSign = extractFloat32Sign( b );
2929 av = float32_val(a);
2930 bv = float32_val(b);
2931 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2932 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2936 /*----------------------------------------------------------------------------
2937 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2938 | be compared, and 0 otherwise. The invalid exception is raised if either
2939 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2940 | Standard for Binary Floating-Point Arithmetic.
2941 *----------------------------------------------------------------------------*/
2943 int float32_unordered(float32 a, float32 b, float_status *status)
2945 a = float32_squash_input_denormal(a, status);
2946 b = float32_squash_input_denormal(b, status);
2948 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2949 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2951 float_raise(float_flag_invalid, status);
2952 return 1;
2954 return 0;
2957 /*----------------------------------------------------------------------------
2958 | Returns 1 if the single-precision floating-point value `a' is equal to
2959 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2960 | exception. The comparison is performed according to the IEC/IEEE Standard
2961 | for Binary Floating-Point Arithmetic.
2962 *----------------------------------------------------------------------------*/
2964 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2966 a = float32_squash_input_denormal(a, status);
2967 b = float32_squash_input_denormal(b, status);
2969 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2970 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2972 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2973 float_raise(float_flag_invalid, status);
2975 return 0;
2977 return ( float32_val(a) == float32_val(b) ) ||
2978 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is less than or
2983 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2984 | cause an exception. Otherwise, the comparison is performed according to the
2985 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2988 int float32_le_quiet(float32 a, float32 b, float_status *status)
2990 flag aSign, bSign;
2991 uint32_t av, bv;
2992 a = float32_squash_input_denormal(a, status);
2993 b = float32_squash_input_denormal(b, status);
2995 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2996 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2998 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2999 float_raise(float_flag_invalid, status);
3001 return 0;
3003 aSign = extractFloat32Sign( a );
3004 bSign = extractFloat32Sign( b );
3005 av = float32_val(a);
3006 bv = float32_val(b);
3007 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3008 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3012 /*----------------------------------------------------------------------------
3013 | Returns 1 if the single-precision floating-point value `a' is less than
3014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3016 | Standard for Binary Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3019 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3021 flag aSign, bSign;
3022 uint32_t av, bv;
3023 a = float32_squash_input_denormal(a, status);
3024 b = float32_squash_input_denormal(b, status);
3026 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3029 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3030 float_raise(float_flag_invalid, status);
3032 return 0;
3034 aSign = extractFloat32Sign( a );
3035 bSign = extractFloat32Sign( b );
3036 av = float32_val(a);
3037 bv = float32_val(b);
3038 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3039 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3043 /*----------------------------------------------------------------------------
3044 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3045 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3046 | comparison is performed according to the IEC/IEEE Standard for Binary
3047 | Floating-Point Arithmetic.
3048 *----------------------------------------------------------------------------*/
3050 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3052 a = float32_squash_input_denormal(a, status);
3053 b = float32_squash_input_denormal(b, status);
3055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3058 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3059 float_raise(float_flag_invalid, status);
3061 return 1;
3063 return 0;
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of converting the double-precision floating-point value
3068 | `a' to the 32-bit two's complement integer format. The conversion is
3069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3070 | Arithmetic---which means in particular that the conversion is rounded
3071 | according to the current rounding mode. If `a' is a NaN, the largest
3072 | positive integer is returned. Otherwise, if the conversion overflows, the
3073 | largest integer with the same sign as `a' is returned.
3074 *----------------------------------------------------------------------------*/
3076 int32 float64_to_int32(float64 a, float_status *status)
3078 flag aSign;
3079 int_fast16_t aExp, shiftCount;
3080 uint64_t aSig;
3081 a = float64_squash_input_denormal(a, status);
3083 aSig = extractFloat64Frac( a );
3084 aExp = extractFloat64Exp( a );
3085 aSign = extractFloat64Sign( a );
3086 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3087 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088 shiftCount = 0x42C - aExp;
3089 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3090 return roundAndPackInt32(aSign, aSig, status);
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format. The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic, except that the conversion is always rounded toward zero.
3099 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3100 | the conversion overflows, the largest integer with the same sign as `a' is
3101 | returned.
3102 *----------------------------------------------------------------------------*/
3104 int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
3106 flag aSign;
3107 int_fast16_t aExp, shiftCount;
3108 uint64_t aSig, savedASig;
3109 int32_t z;
3110 a = float64_squash_input_denormal(a, status);
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 aSign = extractFloat64Sign( a );
3115 if ( 0x41E < aExp ) {
3116 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3117 goto invalid;
3119 else if ( aExp < 0x3FF ) {
3120 if (aExp || aSig) {
3121 status->float_exception_flags |= float_flag_inexact;
3123 return 0;
3125 aSig |= LIT64( 0x0010000000000000 );
3126 shiftCount = 0x433 - aExp;
3127 savedASig = aSig;
3128 aSig >>= shiftCount;
3129 z = aSig;
3130 if ( aSign ) z = - z;
3131 if ( ( z < 0 ) ^ aSign ) {
3132 invalid:
3133 float_raise(float_flag_invalid, status);
3134 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3136 if ( ( aSig<<shiftCount ) != savedASig ) {
3137 status->float_exception_flags |= float_flag_inexact;
3139 return z;
3143 /*----------------------------------------------------------------------------
3144 | Returns the result of converting the double-precision floating-point value
3145 | `a' to the 16-bit two's complement integer format. The conversion is
3146 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3147 | Arithmetic, except that the conversion is always rounded toward zero.
3148 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3149 | the conversion overflows, the largest integer with the same sign as `a' is
3150 | returned.
3151 *----------------------------------------------------------------------------*/
3153 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3155 flag aSign;
3156 int_fast16_t aExp, shiftCount;
3157 uint64_t aSig, savedASig;
3158 int32 z;
3160 aSig = extractFloat64Frac( a );
3161 aExp = extractFloat64Exp( a );
3162 aSign = extractFloat64Sign( a );
3163 if ( 0x40E < aExp ) {
3164 if ( ( aExp == 0x7FF ) && aSig ) {
3165 aSign = 0;
3167 goto invalid;
3169 else if ( aExp < 0x3FF ) {
3170 if ( aExp || aSig ) {
3171 status->float_exception_flags |= float_flag_inexact;
3173 return 0;
3175 aSig |= LIT64( 0x0010000000000000 );
3176 shiftCount = 0x433 - aExp;
3177 savedASig = aSig;
3178 aSig >>= shiftCount;
3179 z = aSig;
3180 if ( aSign ) {
3181 z = - z;
3183 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3184 invalid:
3185 float_raise(float_flag_invalid, status);
3186 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3188 if ( ( aSig<<shiftCount ) != savedASig ) {
3189 status->float_exception_flags |= float_flag_inexact;
3191 return z;
3194 /*----------------------------------------------------------------------------
3195 | Returns the result of converting the double-precision floating-point value
3196 | `a' to the 64-bit two's complement integer format. The conversion is
3197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3198 | Arithmetic---which means in particular that the conversion is rounded
3199 | according to the current rounding mode. If `a' is a NaN, the largest
3200 | positive integer is returned. Otherwise, if the conversion overflows, the
3201 | largest integer with the same sign as `a' is returned.
3202 *----------------------------------------------------------------------------*/
3204 int64 float64_to_int64(float64 a, float_status *status)
3206 flag aSign;
3207 int_fast16_t aExp, shiftCount;
3208 uint64_t aSig, aSigExtra;
3209 a = float64_squash_input_denormal(a, status);
3211 aSig = extractFloat64Frac( a );
3212 aExp = extractFloat64Exp( a );
3213 aSign = extractFloat64Sign( a );
3214 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3215 shiftCount = 0x433 - aExp;
3216 if ( shiftCount <= 0 ) {
3217 if ( 0x43E < aExp ) {
3218 float_raise(float_flag_invalid, status);
3219 if ( ! aSign
3220 || ( ( aExp == 0x7FF )
3221 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3223 return LIT64( 0x7FFFFFFFFFFFFFFF );
3225 return (int64_t) LIT64( 0x8000000000000000 );
3227 aSigExtra = 0;
3228 aSig <<= - shiftCount;
3230 else {
3231 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3233 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3237 /*----------------------------------------------------------------------------
3238 | Returns the result of converting the double-precision floating-point value
3239 | `a' to the 64-bit two's complement integer format. The conversion is
3240 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3241 | Arithmetic, except that the conversion is always rounded toward zero.
3242 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3243 | the conversion overflows, the largest integer with the same sign as `a' is
3244 | returned.
3245 *----------------------------------------------------------------------------*/
3247 int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
3249 flag aSign;
3250 int_fast16_t aExp, shiftCount;
3251 uint64_t aSig;
3252 int64 z;
3253 a = float64_squash_input_denormal(a, status);
3255 aSig = extractFloat64Frac( a );
3256 aExp = extractFloat64Exp( a );
3257 aSign = extractFloat64Sign( a );
3258 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3259 shiftCount = aExp - 0x433;
3260 if ( 0 <= shiftCount ) {
3261 if ( 0x43E <= aExp ) {
3262 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3263 float_raise(float_flag_invalid, status);
3264 if ( ! aSign
3265 || ( ( aExp == 0x7FF )
3266 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3268 return LIT64( 0x7FFFFFFFFFFFFFFF );
3271 return (int64_t) LIT64( 0x8000000000000000 );
3273 z = aSig<<shiftCount;
3275 else {
3276 if ( aExp < 0x3FE ) {
3277 if (aExp | aSig) {
3278 status->float_exception_flags |= float_flag_inexact;
3280 return 0;
3282 z = aSig>>( - shiftCount );
3283 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3284 status->float_exception_flags |= float_flag_inexact;
3287 if ( aSign ) z = - z;
3288 return z;
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the double-precision floating-point value
3294 | `a' to the single-precision floating-point format. The conversion is
3295 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3296 | Arithmetic.
3297 *----------------------------------------------------------------------------*/
3299 float32 float64_to_float32(float64 a, float_status *status)
3301 flag aSign;
3302 int_fast16_t aExp;
3303 uint64_t aSig;
3304 uint32_t zSig;
3305 a = float64_squash_input_denormal(a, status);
3307 aSig = extractFloat64Frac( a );
3308 aExp = extractFloat64Exp( a );
3309 aSign = extractFloat64Sign( a );
3310 if ( aExp == 0x7FF ) {
3311 if (aSig) {
3312 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3314 return packFloat32( aSign, 0xFF, 0 );
3316 shift64RightJamming( aSig, 22, &aSig );
3317 zSig = aSig;
3318 if ( aExp || zSig ) {
3319 zSig |= 0x40000000;
3320 aExp -= 0x381;
3322 return roundAndPackFloat32(aSign, aExp, zSig, status);
3327 /*----------------------------------------------------------------------------
3328 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3329 | half-precision floating-point value, returning the result. After being
3330 | shifted into the proper positions, the three fields are simply added
3331 | together to form the result. This means that any integer portion of `zSig'
3332 | will be added into the exponent. Since a properly normalized significand
3333 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3334 | than the desired result exponent whenever `zSig' is a complete, normalized
3335 | significand.
3336 *----------------------------------------------------------------------------*/
3337 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3339 return make_float16(
3340 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3343 /*----------------------------------------------------------------------------
3344 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3345 | and significand `zSig', and returns the proper half-precision floating-
3346 | point value corresponding to the abstract input. Ordinarily, the abstract
3347 | value is simply rounded and packed into the half-precision format, with
3348 | the inexact exception raised if the abstract input cannot be represented
3349 | exactly. However, if the abstract value is too large, the overflow and
3350 | inexact exceptions are raised and an infinity or maximal finite value is
3351 | returned. If the abstract value is too small, the input value is rounded to
3352 | a subnormal number, and the underflow and inexact exceptions are raised if
3353 | the abstract input cannot be represented exactly as a subnormal half-
3354 | precision floating-point number.
3355 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3356 | ARM-style "alternative representation", which omits the NaN and Inf
3357 | encodings in order to raise the maximum representable exponent by one.
3358 | The input significand `zSig' has its binary point between bits 22
3359 | and 23, which is 13 bits to the left of the usual location. This shifted
3360 | significand must be normalized or smaller. If `zSig' is not normalized,
3361 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3362 | and it must not require rounding. In the usual case that `zSig' is
3363 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3364 | Note the slightly odd position of the binary point in zSig compared with the
3365 | other roundAndPackFloat functions. This should probably be fixed if we
3366 | need to implement more float16 routines than just conversion.
3367 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3368 | Binary Floating-Point Arithmetic.
3369 *----------------------------------------------------------------------------*/
3371 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3372 uint32_t zSig, flag ieee,
3373 float_status *status)
3375 int maxexp = ieee ? 29 : 30;
3376 uint32_t mask;
3377 uint32_t increment;
3378 bool rounding_bumps_exp;
3379 bool is_tiny = false;
3381 /* Calculate the mask of bits of the mantissa which are not
3382 * representable in half-precision and will be lost.
3384 if (zExp < 1) {
3385 /* Will be denormal in halfprec */
3386 mask = 0x00ffffff;
3387 if (zExp >= -11) {
3388 mask >>= 11 + zExp;
3390 } else {
3391 /* Normal number in halfprec */
3392 mask = 0x00001fff;
3395 switch (status->float_rounding_mode) {
3396 case float_round_nearest_even:
3397 increment = (mask + 1) >> 1;
3398 if ((zSig & mask) == increment) {
3399 increment = zSig & (increment << 1);
3401 break;
3402 case float_round_ties_away:
3403 increment = (mask + 1) >> 1;
3404 break;
3405 case float_round_up:
3406 increment = zSign ? 0 : mask;
3407 break;
3408 case float_round_down:
3409 increment = zSign ? mask : 0;
3410 break;
3411 default: /* round_to_zero */
3412 increment = 0;
3413 break;
3416 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3418 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3419 if (ieee) {
3420 float_raise(float_flag_overflow | float_flag_inexact, status);
3421 return packFloat16(zSign, 0x1f, 0);
3422 } else {
3423 float_raise(float_flag_invalid, status);
3424 return packFloat16(zSign, 0x1f, 0x3ff);
3428 if (zExp < 0) {
3429 /* Note that flush-to-zero does not affect half-precision results */
3430 is_tiny =
3431 (status->float_detect_tininess == float_tininess_before_rounding)
3432 || (zExp < -1)
3433 || (!rounding_bumps_exp);
3435 if (zSig & mask) {
3436 float_raise(float_flag_inexact, status);
3437 if (is_tiny) {
3438 float_raise(float_flag_underflow, status);
3442 zSig += increment;
3443 if (rounding_bumps_exp) {
3444 zSig >>= 1;
3445 zExp++;
3448 if (zExp < -10) {
3449 return packFloat16(zSign, 0, 0);
3451 if (zExp < 0) {
3452 zSig >>= -zExp;
3453 zExp = 0;
3455 return packFloat16(zSign, zExp, zSig >> 13);
3458 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3459 uint32_t *zSigPtr)
3461 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3462 *zSigPtr = aSig << shiftCount;
3463 *zExpPtr = 1 - shiftCount;
3466 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3467 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3469 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3471 flag aSign;
3472 int_fast16_t aExp;
3473 uint32_t aSig;
3475 aSign = extractFloat16Sign(a);
3476 aExp = extractFloat16Exp(a);
3477 aSig = extractFloat16Frac(a);
3479 if (aExp == 0x1f && ieee) {
3480 if (aSig) {
3481 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3483 return packFloat32(aSign, 0xff, 0);
3485 if (aExp == 0) {
3486 if (aSig == 0) {
3487 return packFloat32(aSign, 0, 0);
3490 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3491 aExp--;
3493 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3496 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3498 flag aSign;
3499 int_fast16_t aExp;
3500 uint32_t aSig;
3502 a = float32_squash_input_denormal(a, status);
3504 aSig = extractFloat32Frac( a );
3505 aExp = extractFloat32Exp( a );
3506 aSign = extractFloat32Sign( a );
3507 if ( aExp == 0xFF ) {
3508 if (aSig) {
3509 /* Input is a NaN */
3510 if (!ieee) {
3511 float_raise(float_flag_invalid, status);
3512 return packFloat16(aSign, 0, 0);
3514 return commonNaNToFloat16(
3515 float32ToCommonNaN(a, status), status);
3517 /* Infinity */
3518 if (!ieee) {
3519 float_raise(float_flag_invalid, status);
3520 return packFloat16(aSign, 0x1f, 0x3ff);
3522 return packFloat16(aSign, 0x1f, 0);
3524 if (aExp == 0 && aSig == 0) {
3525 return packFloat16(aSign, 0, 0);
3527 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3528 * even if the input is denormal; however this is harmless because
3529 * the largest possible single-precision denormal is still smaller
3530 * than the smallest representable half-precision denormal, and so we
3531 * will end up ignoring aSig and returning via the "always return zero"
3532 * codepath.
3534 aSig |= 0x00800000;
3535 aExp -= 0x71;
3537 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3540 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3542 flag aSign;
3543 int_fast16_t aExp;
3544 uint32_t aSig;
3546 aSign = extractFloat16Sign(a);
3547 aExp = extractFloat16Exp(a);
3548 aSig = extractFloat16Frac(a);
3550 if (aExp == 0x1f && ieee) {
3551 if (aSig) {
3552 return commonNaNToFloat64(
3553 float16ToCommonNaN(a, status), status);
3555 return packFloat64(aSign, 0x7ff, 0);
3557 if (aExp == 0) {
3558 if (aSig == 0) {
3559 return packFloat64(aSign, 0, 0);
3562 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3563 aExp--;
3565 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3568 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3570 flag aSign;
3571 int_fast16_t aExp;
3572 uint64_t aSig;
3573 uint32_t zSig;
3575 a = float64_squash_input_denormal(a, status);
3577 aSig = extractFloat64Frac(a);
3578 aExp = extractFloat64Exp(a);
3579 aSign = extractFloat64Sign(a);
3580 if (aExp == 0x7FF) {
3581 if (aSig) {
3582 /* Input is a NaN */
3583 if (!ieee) {
3584 float_raise(float_flag_invalid, status);
3585 return packFloat16(aSign, 0, 0);
3587 return commonNaNToFloat16(
3588 float64ToCommonNaN(a, status), status);
3590 /* Infinity */
3591 if (!ieee) {
3592 float_raise(float_flag_invalid, status);
3593 return packFloat16(aSign, 0x1f, 0x3ff);
3595 return packFloat16(aSign, 0x1f, 0);
3597 shift64RightJamming(aSig, 29, &aSig);
3598 zSig = aSig;
3599 if (aExp == 0 && zSig == 0) {
3600 return packFloat16(aSign, 0, 0);
3602 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3603 * even if the input is denormal; however this is harmless because
3604 * the largest possible single-precision denormal is still smaller
3605 * than the smallest representable half-precision denormal, and so we
3606 * will end up ignoring aSig and returning via the "always return zero"
3607 * codepath.
3609 zSig |= 0x00800000;
3610 aExp -= 0x3F1;
3612 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3615 /*----------------------------------------------------------------------------
3616 | Returns the result of converting the double-precision floating-point value
3617 | `a' to the extended double-precision floating-point format. The conversion
3618 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3619 | Arithmetic.
3620 *----------------------------------------------------------------------------*/
3622 floatx80 float64_to_floatx80(float64 a, float_status *status)
3624 flag aSign;
3625 int_fast16_t aExp;
3626 uint64_t aSig;
3628 a = float64_squash_input_denormal(a, status);
3629 aSig = extractFloat64Frac( a );
3630 aExp = extractFloat64Exp( a );
3631 aSign = extractFloat64Sign( a );
3632 if ( aExp == 0x7FF ) {
3633 if (aSig) {
3634 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3636 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3638 if ( aExp == 0 ) {
3639 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3640 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3642 return
3643 packFloatx80(
3644 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the quadruple-precision floating-point format. The conversion is
3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3655 float128 float64_to_float128(float64 a, float_status *status)
3657 flag aSign;
3658 int_fast16_t aExp;
3659 uint64_t aSig, zSig0, zSig1;
3661 a = float64_squash_input_denormal(a, status);
3662 aSig = extractFloat64Frac( a );
3663 aExp = extractFloat64Exp( a );
3664 aSign = extractFloat64Sign( a );
3665 if ( aExp == 0x7FF ) {
3666 if (aSig) {
3667 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3669 return packFloat128( aSign, 0x7FFF, 0, 0 );
3671 if ( aExp == 0 ) {
3672 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674 --aExp;
3676 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3677 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3681 /*----------------------------------------------------------------------------
3682 | Rounds the double-precision floating-point value `a' to an integer, and
3683 | returns the result as a double-precision floating-point value. The
3684 | operation is performed according to the IEC/IEEE Standard for Binary
3685 | Floating-Point Arithmetic.
3686 *----------------------------------------------------------------------------*/
3688 float64 float64_round_to_int(float64 a, float_status *status)
3690 flag aSign;
3691 int_fast16_t aExp;
3692 uint64_t lastBitMask, roundBitsMask;
3693 uint64_t z;
3694 a = float64_squash_input_denormal(a, status);
3696 aExp = extractFloat64Exp( a );
3697 if ( 0x433 <= aExp ) {
3698 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3699 return propagateFloat64NaN(a, a, status);
3701 return a;
3703 if ( aExp < 0x3FF ) {
3704 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3705 status->float_exception_flags |= float_flag_inexact;
3706 aSign = extractFloat64Sign( a );
3707 switch (status->float_rounding_mode) {
3708 case float_round_nearest_even:
3709 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3710 return packFloat64( aSign, 0x3FF, 0 );
3712 break;
3713 case float_round_ties_away:
3714 if (aExp == 0x3FE) {
3715 return packFloat64(aSign, 0x3ff, 0);
3717 break;
3718 case float_round_down:
3719 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3720 case float_round_up:
3721 return make_float64(
3722 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3724 return packFloat64( aSign, 0, 0 );
3726 lastBitMask = 1;
3727 lastBitMask <<= 0x433 - aExp;
3728 roundBitsMask = lastBitMask - 1;
3729 z = float64_val(a);
3730 switch (status->float_rounding_mode) {
3731 case float_round_nearest_even:
3732 z += lastBitMask >> 1;
3733 if ((z & roundBitsMask) == 0) {
3734 z &= ~lastBitMask;
3736 break;
3737 case float_round_ties_away:
3738 z += lastBitMask >> 1;
3739 break;
3740 case float_round_to_zero:
3741 break;
3742 case float_round_up:
3743 if (!extractFloat64Sign(make_float64(z))) {
3744 z += roundBitsMask;
3746 break;
3747 case float_round_down:
3748 if (extractFloat64Sign(make_float64(z))) {
3749 z += roundBitsMask;
3751 break;
3752 default:
3753 abort();
3755 z &= ~ roundBitsMask;
3756 if (z != float64_val(a)) {
3757 status->float_exception_flags |= float_flag_inexact;
3759 return make_float64(z);
3763 float64 float64_trunc_to_int(float64 a, float_status *status)
3765 int oldmode;
3766 float64 res;
3767 oldmode = status->float_rounding_mode;
3768 status->float_rounding_mode = float_round_to_zero;
3769 res = float64_round_to_int(a, status);
3770 status->float_rounding_mode = oldmode;
3771 return res;
3774 /*----------------------------------------------------------------------------
3775 | Returns the result of adding the absolute values of the double-precision
3776 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3777 | before being returned. `zSign' is ignored if the result is a NaN.
3778 | The addition is performed according to the IEC/IEEE Standard for Binary
3779 | Floating-Point Arithmetic.
3780 *----------------------------------------------------------------------------*/
3782 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3783 float_status *status)
3785 int_fast16_t aExp, bExp, zExp;
3786 uint64_t aSig, bSig, zSig;
3787 int_fast16_t expDiff;
3789 aSig = extractFloat64Frac( a );
3790 aExp = extractFloat64Exp( a );
3791 bSig = extractFloat64Frac( b );
3792 bExp = extractFloat64Exp( b );
3793 expDiff = aExp - bExp;
3794 aSig <<= 9;
3795 bSig <<= 9;
3796 if ( 0 < expDiff ) {
3797 if ( aExp == 0x7FF ) {
3798 if (aSig) {
3799 return propagateFloat64NaN(a, b, status);
3801 return a;
3803 if ( bExp == 0 ) {
3804 --expDiff;
3806 else {
3807 bSig |= LIT64( 0x2000000000000000 );
3809 shift64RightJamming( bSig, expDiff, &bSig );
3810 zExp = aExp;
3812 else if ( expDiff < 0 ) {
3813 if ( bExp == 0x7FF ) {
3814 if (bSig) {
3815 return propagateFloat64NaN(a, b, status);
3817 return packFloat64( zSign, 0x7FF, 0 );
3819 if ( aExp == 0 ) {
3820 ++expDiff;
3822 else {
3823 aSig |= LIT64( 0x2000000000000000 );
3825 shift64RightJamming( aSig, - expDiff, &aSig );
3826 zExp = bExp;
3828 else {
3829 if ( aExp == 0x7FF ) {
3830 if (aSig | bSig) {
3831 return propagateFloat64NaN(a, b, status);
3833 return a;
3835 if ( aExp == 0 ) {
3836 if (status->flush_to_zero) {
3837 if (aSig | bSig) {
3838 float_raise(float_flag_output_denormal, status);
3840 return packFloat64(zSign, 0, 0);
3842 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3844 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3845 zExp = aExp;
3846 goto roundAndPack;
3848 aSig |= LIT64( 0x2000000000000000 );
3849 zSig = ( aSig + bSig )<<1;
3850 --zExp;
3851 if ( (int64_t) zSig < 0 ) {
3852 zSig = aSig + bSig;
3853 ++zExp;
3855 roundAndPack:
3856 return roundAndPackFloat64(zSign, zExp, zSig, status);
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the absolute values of the double-
3862 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3863 | difference is negated before being returned. `zSign' is ignored if the
3864 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3865 | Standard for Binary Floating-Point Arithmetic.
3866 *----------------------------------------------------------------------------*/
3868 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3869 float_status *status)
3871 int_fast16_t aExp, bExp, zExp;
3872 uint64_t aSig, bSig, zSig;
3873 int_fast16_t expDiff;
3875 aSig = extractFloat64Frac( a );
3876 aExp = extractFloat64Exp( a );
3877 bSig = extractFloat64Frac( b );
3878 bExp = extractFloat64Exp( b );
3879 expDiff = aExp - bExp;
3880 aSig <<= 10;
3881 bSig <<= 10;
3882 if ( 0 < expDiff ) goto aExpBigger;
3883 if ( expDiff < 0 ) goto bExpBigger;
3884 if ( aExp == 0x7FF ) {
3885 if (aSig | bSig) {
3886 return propagateFloat64NaN(a, b, status);
3888 float_raise(float_flag_invalid, status);
3889 return float64_default_nan;
3891 if ( aExp == 0 ) {
3892 aExp = 1;
3893 bExp = 1;
3895 if ( bSig < aSig ) goto aBigger;
3896 if ( aSig < bSig ) goto bBigger;
3897 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3898 bExpBigger:
3899 if ( bExp == 0x7FF ) {
3900 if (bSig) {
3901 return propagateFloat64NaN(a, b, status);
3903 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3905 if ( aExp == 0 ) {
3906 ++expDiff;
3908 else {
3909 aSig |= LIT64( 0x4000000000000000 );
3911 shift64RightJamming( aSig, - expDiff, &aSig );
3912 bSig |= LIT64( 0x4000000000000000 );
3913 bBigger:
3914 zSig = bSig - aSig;
3915 zExp = bExp;
3916 zSign ^= 1;
3917 goto normalizeRoundAndPack;
3918 aExpBigger:
3919 if ( aExp == 0x7FF ) {
3920 if (aSig) {
3921 return propagateFloat64NaN(a, b, status);
3923 return a;
3925 if ( bExp == 0 ) {
3926 --expDiff;
3928 else {
3929 bSig |= LIT64( 0x4000000000000000 );
3931 shift64RightJamming( bSig, expDiff, &bSig );
3932 aSig |= LIT64( 0x4000000000000000 );
3933 aBigger:
3934 zSig = aSig - bSig;
3935 zExp = aExp;
3936 normalizeRoundAndPack:
3937 --zExp;
3938 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3942 /*----------------------------------------------------------------------------
3943 | Returns the result of adding the double-precision floating-point values `a'
3944 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3945 | Binary Floating-Point Arithmetic.
3946 *----------------------------------------------------------------------------*/
3948 float64 float64_add(float64 a, float64 b, float_status *status)
3950 flag aSign, bSign;
3951 a = float64_squash_input_denormal(a, status);
3952 b = float64_squash_input_denormal(b, status);
3954 aSign = extractFloat64Sign( a );
3955 bSign = extractFloat64Sign( b );
3956 if ( aSign == bSign ) {
3957 return addFloat64Sigs(a, b, aSign, status);
3959 else {
3960 return subFloat64Sigs(a, b, aSign, status);
3965 /*----------------------------------------------------------------------------
3966 | Returns the result of subtracting the double-precision floating-point values
3967 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3968 | for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3971 float64 float64_sub(float64 a, float64 b, float_status *status)
3973 flag aSign, bSign;
3974 a = float64_squash_input_denormal(a, status);
3975 b = float64_squash_input_denormal(b, status);
3977 aSign = extractFloat64Sign( a );
3978 bSign = extractFloat64Sign( b );
3979 if ( aSign == bSign ) {
3980 return subFloat64Sigs(a, b, aSign, status);
3982 else {
3983 return addFloat64Sigs(a, b, aSign, status);
3988 /*----------------------------------------------------------------------------
3989 | Returns the result of multiplying the double-precision floating-point values
3990 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3991 | for Binary Floating-Point Arithmetic.
3992 *----------------------------------------------------------------------------*/
3994 float64 float64_mul(float64 a, float64 b, float_status *status)
3996 flag aSign, bSign, zSign;
3997 int_fast16_t aExp, bExp, zExp;
3998 uint64_t aSig, bSig, zSig0, zSig1;
4000 a = float64_squash_input_denormal(a, status);
4001 b = float64_squash_input_denormal(b, status);
4003 aSig = extractFloat64Frac( a );
4004 aExp = extractFloat64Exp( a );
4005 aSign = extractFloat64Sign( a );
4006 bSig = extractFloat64Frac( b );
4007 bExp = extractFloat64Exp( b );
4008 bSign = extractFloat64Sign( b );
4009 zSign = aSign ^ bSign;
4010 if ( aExp == 0x7FF ) {
4011 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4012 return propagateFloat64NaN(a, b, status);
4014 if ( ( bExp | bSig ) == 0 ) {
4015 float_raise(float_flag_invalid, status);
4016 return float64_default_nan;
4018 return packFloat64( zSign, 0x7FF, 0 );
4020 if ( bExp == 0x7FF ) {
4021 if (bSig) {
4022 return propagateFloat64NaN(a, b, status);
4024 if ( ( aExp | aSig ) == 0 ) {
4025 float_raise(float_flag_invalid, status);
4026 return float64_default_nan;
4028 return packFloat64( zSign, 0x7FF, 0 );
4030 if ( aExp == 0 ) {
4031 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4032 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4034 if ( bExp == 0 ) {
4035 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4036 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4038 zExp = aExp + bExp - 0x3FF;
4039 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4040 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4041 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4042 zSig0 |= ( zSig1 != 0 );
4043 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4044 zSig0 <<= 1;
4045 --zExp;
4047 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4051 /*----------------------------------------------------------------------------
4052 | Returns the result of dividing the double-precision floating-point value `a'
4053 | by the corresponding value `b'. The operation is performed according to
4054 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4055 *----------------------------------------------------------------------------*/
4057 float64 float64_div(float64 a, float64 b, float_status *status)
4059 flag aSign, bSign, zSign;
4060 int_fast16_t aExp, bExp, zExp;
4061 uint64_t aSig, bSig, zSig;
4062 uint64_t rem0, rem1;
4063 uint64_t term0, term1;
4064 a = float64_squash_input_denormal(a, status);
4065 b = float64_squash_input_denormal(b, status);
4067 aSig = extractFloat64Frac( a );
4068 aExp = extractFloat64Exp( a );
4069 aSign = extractFloat64Sign( a );
4070 bSig = extractFloat64Frac( b );
4071 bExp = extractFloat64Exp( b );
4072 bSign = extractFloat64Sign( b );
4073 zSign = aSign ^ bSign;
4074 if ( aExp == 0x7FF ) {
4075 if (aSig) {
4076 return propagateFloat64NaN(a, b, status);
4078 if ( bExp == 0x7FF ) {
4079 if (bSig) {
4080 return propagateFloat64NaN(a, b, status);
4082 float_raise(float_flag_invalid, status);
4083 return float64_default_nan;
4085 return packFloat64( zSign, 0x7FF, 0 );
4087 if ( bExp == 0x7FF ) {
4088 if (bSig) {
4089 return propagateFloat64NaN(a, b, status);
4091 return packFloat64( zSign, 0, 0 );
4093 if ( bExp == 0 ) {
4094 if ( bSig == 0 ) {
4095 if ( ( aExp | aSig ) == 0 ) {
4096 float_raise(float_flag_invalid, status);
4097 return float64_default_nan;
4099 float_raise(float_flag_divbyzero, status);
4100 return packFloat64( zSign, 0x7FF, 0 );
4102 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4104 if ( aExp == 0 ) {
4105 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4106 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4108 zExp = aExp - bExp + 0x3FD;
4109 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4110 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4111 if ( bSig <= ( aSig + aSig ) ) {
4112 aSig >>= 1;
4113 ++zExp;
4115 zSig = estimateDiv128To64( aSig, 0, bSig );
4116 if ( ( zSig & 0x1FF ) <= 2 ) {
4117 mul64To128( bSig, zSig, &term0, &term1 );
4118 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4119 while ( (int64_t) rem0 < 0 ) {
4120 --zSig;
4121 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4123 zSig |= ( rem1 != 0 );
4125 return roundAndPackFloat64(zSign, zExp, zSig, status);
4129 /*----------------------------------------------------------------------------
4130 | Returns the remainder of the double-precision floating-point value `a'
4131 | with respect to the corresponding value `b'. The operation is performed
4132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4133 *----------------------------------------------------------------------------*/
4135 float64 float64_rem(float64 a, float64 b, float_status *status)
4137 flag aSign, zSign;
4138 int_fast16_t aExp, bExp, expDiff;
4139 uint64_t aSig, bSig;
4140 uint64_t q, alternateASig;
4141 int64_t sigMean;
4143 a = float64_squash_input_denormal(a, status);
4144 b = float64_squash_input_denormal(b, status);
4145 aSig = extractFloat64Frac( a );
4146 aExp = extractFloat64Exp( a );
4147 aSign = extractFloat64Sign( a );
4148 bSig = extractFloat64Frac( b );
4149 bExp = extractFloat64Exp( b );
4150 if ( aExp == 0x7FF ) {
4151 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4152 return propagateFloat64NaN(a, b, status);
4154 float_raise(float_flag_invalid, status);
4155 return float64_default_nan;
4157 if ( bExp == 0x7FF ) {
4158 if (bSig) {
4159 return propagateFloat64NaN(a, b, status);
4161 return a;
4163 if ( bExp == 0 ) {
4164 if ( bSig == 0 ) {
4165 float_raise(float_flag_invalid, status);
4166 return float64_default_nan;
4168 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4170 if ( aExp == 0 ) {
4171 if ( aSig == 0 ) return a;
4172 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4174 expDiff = aExp - bExp;
4175 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4176 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4177 if ( expDiff < 0 ) {
4178 if ( expDiff < -1 ) return a;
4179 aSig >>= 1;
4181 q = ( bSig <= aSig );
4182 if ( q ) aSig -= bSig;
4183 expDiff -= 64;
4184 while ( 0 < expDiff ) {
4185 q = estimateDiv128To64( aSig, 0, bSig );
4186 q = ( 2 < q ) ? q - 2 : 0;
4187 aSig = - ( ( bSig>>2 ) * q );
4188 expDiff -= 62;
4190 expDiff += 64;
4191 if ( 0 < expDiff ) {
4192 q = estimateDiv128To64( aSig, 0, bSig );
4193 q = ( 2 < q ) ? q - 2 : 0;
4194 q >>= 64 - expDiff;
4195 bSig >>= 2;
4196 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4198 else {
4199 aSig >>= 2;
4200 bSig >>= 2;
4202 do {
4203 alternateASig = aSig;
4204 ++q;
4205 aSig -= bSig;
4206 } while ( 0 <= (int64_t) aSig );
4207 sigMean = aSig + alternateASig;
4208 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4209 aSig = alternateASig;
4211 zSign = ( (int64_t) aSig < 0 );
4212 if ( zSign ) aSig = - aSig;
4213 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4217 /*----------------------------------------------------------------------------
4218 | Returns the result of multiplying the double-precision floating-point values
4219 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4220 | multiplication. The operation is performed according to the IEC/IEEE
4221 | Standard for Binary Floating-Point Arithmetic 754-2008.
4222 | The flags argument allows the caller to select negation of the
4223 | addend, the intermediate product, or the final result. (The difference
4224 | between this and having the caller do a separate negation is that negating
4225 | externally will flip the sign bit on NaNs.)
4226 *----------------------------------------------------------------------------*/
4228 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4229 float_status *status)
4231 flag aSign, bSign, cSign, zSign;
4232 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4233 uint64_t aSig, bSig, cSig;
4234 flag pInf, pZero, pSign;
4235 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4236 int shiftcount;
4237 flag signflip, infzero;
4239 a = float64_squash_input_denormal(a, status);
4240 b = float64_squash_input_denormal(b, status);
4241 c = float64_squash_input_denormal(c, status);
4242 aSig = extractFloat64Frac(a);
4243 aExp = extractFloat64Exp(a);
4244 aSign = extractFloat64Sign(a);
4245 bSig = extractFloat64Frac(b);
4246 bExp = extractFloat64Exp(b);
4247 bSign = extractFloat64Sign(b);
4248 cSig = extractFloat64Frac(c);
4249 cExp = extractFloat64Exp(c);
4250 cSign = extractFloat64Sign(c);
4252 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4253 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4255 /* It is implementation-defined whether the cases of (0,inf,qnan)
4256 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4257 * they return if they do), so we have to hand this information
4258 * off to the target-specific pick-a-NaN routine.
4260 if (((aExp == 0x7ff) && aSig) ||
4261 ((bExp == 0x7ff) && bSig) ||
4262 ((cExp == 0x7ff) && cSig)) {
4263 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4266 if (infzero) {
4267 float_raise(float_flag_invalid, status);
4268 return float64_default_nan;
4271 if (flags & float_muladd_negate_c) {
4272 cSign ^= 1;
4275 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4277 /* Work out the sign and type of the product */
4278 pSign = aSign ^ bSign;
4279 if (flags & float_muladd_negate_product) {
4280 pSign ^= 1;
4282 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4283 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4285 if (cExp == 0x7ff) {
4286 if (pInf && (pSign ^ cSign)) {
4287 /* addition of opposite-signed infinities => InvalidOperation */
4288 float_raise(float_flag_invalid, status);
4289 return float64_default_nan;
4291 /* Otherwise generate an infinity of the same sign */
4292 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4295 if (pInf) {
4296 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4299 if (pZero) {
4300 if (cExp == 0) {
4301 if (cSig == 0) {
4302 /* Adding two exact zeroes */
4303 if (pSign == cSign) {
4304 zSign = pSign;
4305 } else if (status->float_rounding_mode == float_round_down) {
4306 zSign = 1;
4307 } else {
4308 zSign = 0;
4310 return packFloat64(zSign ^ signflip, 0, 0);
4312 /* Exact zero plus a denorm */
4313 if (status->flush_to_zero) {
4314 float_raise(float_flag_output_denormal, status);
4315 return packFloat64(cSign ^ signflip, 0, 0);
4318 /* Zero plus something non-zero : just return the something */
4319 if (flags & float_muladd_halve_result) {
4320 if (cExp == 0) {
4321 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4323 /* Subtract one to halve, and one again because roundAndPackFloat64
4324 * wants one less than the true exponent.
4326 cExp -= 2;
4327 cSig = (cSig | 0x0010000000000000ULL) << 10;
4328 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4330 return packFloat64(cSign ^ signflip, cExp, cSig);
4333 if (aExp == 0) {
4334 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4336 if (bExp == 0) {
4337 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4340 /* Calculate the actual result a * b + c */
4342 /* Multiply first; this is easy. */
4343 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4344 * because we want the true exponent, not the "one-less-than"
4345 * flavour that roundAndPackFloat64() takes.
4347 pExp = aExp + bExp - 0x3fe;
4348 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4349 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4350 mul64To128(aSig, bSig, &pSig0, &pSig1);
4351 if ((int64_t)(pSig0 << 1) >= 0) {
4352 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4353 pExp--;
4356 zSign = pSign ^ signflip;
4358 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4359 * bit in position 126.
4361 if (cExp == 0) {
4362 if (!cSig) {
4363 /* Throw out the special case of c being an exact zero now */
4364 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4365 if (flags & float_muladd_halve_result) {
4366 pExp--;
4368 return roundAndPackFloat64(zSign, pExp - 1,
4369 pSig1, status);
4371 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4374 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4375 * significand of the addend, with the explicit bit in position 126.
4377 cSig0 = cSig << (126 - 64 - 52);
4378 cSig1 = 0;
4379 cSig0 |= LIT64(0x4000000000000000);
4380 expDiff = pExp - cExp;
4382 if (pSign == cSign) {
4383 /* Addition */
4384 if (expDiff > 0) {
4385 /* scale c to match p */
4386 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4387 zExp = pExp;
4388 } else if (expDiff < 0) {
4389 /* scale p to match c */
4390 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4391 zExp = cExp;
4392 } else {
4393 /* no scaling needed */
4394 zExp = cExp;
4396 /* Add significands and make sure explicit bit ends up in posn 126 */
4397 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4398 if ((int64_t)zSig0 < 0) {
4399 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4400 } else {
4401 zExp--;
4403 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4404 if (flags & float_muladd_halve_result) {
4405 zExp--;
4407 return roundAndPackFloat64(zSign, zExp, zSig1, status);
4408 } else {
4409 /* Subtraction */
4410 if (expDiff > 0) {
4411 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4412 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4413 zExp = pExp;
4414 } else if (expDiff < 0) {
4415 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4416 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4417 zExp = cExp;
4418 zSign ^= 1;
4419 } else {
4420 zExp = pExp;
4421 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4422 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4424 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4425 zSign ^= 1;
4426 } else {
4427 /* Exact zero */
4428 zSign = signflip;
4429 if (status->float_rounding_mode == float_round_down) {
4430 zSign ^= 1;
4432 return packFloat64(zSign, 0, 0);
4435 --zExp;
4436 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4437 * starting with the significand in a pair of uint64_t.
4439 if (zSig0) {
4440 shiftcount = countLeadingZeros64(zSig0) - 1;
4441 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4442 if (zSig1) {
4443 zSig0 |= 1;
4445 zExp -= shiftcount;
4446 } else {
4447 shiftcount = countLeadingZeros64(zSig1);
4448 if (shiftcount == 0) {
4449 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4450 zExp -= 63;
4451 } else {
4452 shiftcount--;
4453 zSig0 = zSig1 << shiftcount;
4454 zExp -= (shiftcount + 64);
4457 if (flags & float_muladd_halve_result) {
4458 zExp--;
4460 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4464 /*----------------------------------------------------------------------------
4465 | Returns the square root of the double-precision floating-point value `a'.
4466 | The operation is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic.
4468 *----------------------------------------------------------------------------*/
4470 float64 float64_sqrt(float64 a, float_status *status)
4472 flag aSign;
4473 int_fast16_t aExp, zExp;
4474 uint64_t aSig, zSig, doubleZSig;
4475 uint64_t rem0, rem1, term0, term1;
4476 a = float64_squash_input_denormal(a, status);
4478 aSig = extractFloat64Frac( a );
4479 aExp = extractFloat64Exp( a );
4480 aSign = extractFloat64Sign( a );
4481 if ( aExp == 0x7FF ) {
4482 if (aSig) {
4483 return propagateFloat64NaN(a, a, status);
4485 if ( ! aSign ) return a;
4486 float_raise(float_flag_invalid, status);
4487 return float64_default_nan;
4489 if ( aSign ) {
4490 if ( ( aExp | aSig ) == 0 ) return a;
4491 float_raise(float_flag_invalid, status);
4492 return float64_default_nan;
4494 if ( aExp == 0 ) {
4495 if ( aSig == 0 ) return float64_zero;
4496 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4498 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4499 aSig |= LIT64( 0x0010000000000000 );
4500 zSig = estimateSqrt32( aExp, aSig>>21 );
4501 aSig <<= 9 - ( aExp & 1 );
4502 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4503 if ( ( zSig & 0x1FF ) <= 5 ) {
4504 doubleZSig = zSig<<1;
4505 mul64To128( zSig, zSig, &term0, &term1 );
4506 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4507 while ( (int64_t) rem0 < 0 ) {
4508 --zSig;
4509 doubleZSig -= 2;
4510 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4512 zSig |= ( ( rem0 | rem1 ) != 0 );
4514 return roundAndPackFloat64(0, zExp, zSig, status);
4518 /*----------------------------------------------------------------------------
4519 | Returns the binary log of the double-precision floating-point value `a'.
4520 | The operation is performed according to the IEC/IEEE Standard for Binary
4521 | Floating-Point Arithmetic.
4522 *----------------------------------------------------------------------------*/
4523 float64 float64_log2(float64 a, float_status *status)
4525 flag aSign, zSign;
4526 int_fast16_t aExp;
4527 uint64_t aSig, aSig0, aSig1, zSig, i;
4528 a = float64_squash_input_denormal(a, status);
4530 aSig = extractFloat64Frac( a );
4531 aExp = extractFloat64Exp( a );
4532 aSign = extractFloat64Sign( a );
4534 if ( aExp == 0 ) {
4535 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4536 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4538 if ( aSign ) {
4539 float_raise(float_flag_invalid, status);
4540 return float64_default_nan;
4542 if ( aExp == 0x7FF ) {
4543 if (aSig) {
4544 return propagateFloat64NaN(a, float64_zero, status);
4546 return a;
4549 aExp -= 0x3FF;
4550 aSig |= LIT64( 0x0010000000000000 );
4551 zSign = aExp < 0;
4552 zSig = (uint64_t)aExp << 52;
4553 for (i = 1LL << 51; i > 0; i >>= 1) {
4554 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4555 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4556 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4557 aSig >>= 1;
4558 zSig |= i;
4562 if ( zSign )
4563 zSig = -zSig;
4564 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the double-precision floating-point value `a' is equal to the
4569 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4570 | if either operand is a NaN. Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4574 int float64_eq(float64 a, float64 b, float_status *status)
4576 uint64_t av, bv;
4577 a = float64_squash_input_denormal(a, status);
4578 b = float64_squash_input_denormal(b, status);
4580 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4581 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4583 float_raise(float_flag_invalid, status);
4584 return 0;
4586 av = float64_val(a);
4587 bv = float64_val(b);
4588 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4592 /*----------------------------------------------------------------------------
4593 | Returns 1 if the double-precision floating-point value `a' is less than or
4594 | equal to the corresponding value `b', and 0 otherwise. The invalid
4595 | exception is raised if either operand is a NaN. The comparison is performed
4596 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4597 *----------------------------------------------------------------------------*/
4599 int float64_le(float64 a, float64 b, float_status *status)
4601 flag aSign, bSign;
4602 uint64_t av, bv;
4603 a = float64_squash_input_denormal(a, status);
4604 b = float64_squash_input_denormal(b, status);
4606 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4609 float_raise(float_flag_invalid, status);
4610 return 0;
4612 aSign = extractFloat64Sign( a );
4613 bSign = extractFloat64Sign( b );
4614 av = float64_val(a);
4615 bv = float64_val(b);
4616 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4617 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the double-precision floating-point value `a' is less than
4623 | the corresponding value `b', and 0 otherwise. The invalid exception is
4624 | raised if either operand is a NaN. The comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 int float64_lt(float64 a, float64 b, float_status *status)
4630 flag aSign, bSign;
4631 uint64_t av, bv;
4633 a = float64_squash_input_denormal(a, status);
4634 b = float64_squash_input_denormal(b, status);
4635 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4636 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4638 float_raise(float_flag_invalid, status);
4639 return 0;
4641 aSign = extractFloat64Sign( a );
4642 bSign = extractFloat64Sign( b );
4643 av = float64_val(a);
4644 bv = float64_val(b);
4645 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4646 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4650 /*----------------------------------------------------------------------------
4651 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4652 | be compared, and 0 otherwise. The invalid exception is raised if either
4653 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4654 | Standard for Binary Floating-Point Arithmetic.
4655 *----------------------------------------------------------------------------*/
4657 int float64_unordered(float64 a, float64 b, float_status *status)
4659 a = float64_squash_input_denormal(a, status);
4660 b = float64_squash_input_denormal(b, status);
4662 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4663 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4665 float_raise(float_flag_invalid, status);
4666 return 1;
4668 return 0;
4671 /*----------------------------------------------------------------------------
4672 | Returns 1 if the double-precision floating-point value `a' is equal to the
4673 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4674 | exception.The comparison is performed according to the IEC/IEEE Standard
4675 | for Binary Floating-Point Arithmetic.
4676 *----------------------------------------------------------------------------*/
4678 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4680 uint64_t av, bv;
4681 a = float64_squash_input_denormal(a, status);
4682 b = float64_squash_input_denormal(b, status);
4684 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4685 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4687 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4688 float_raise(float_flag_invalid, status);
4690 return 0;
4692 av = float64_val(a);
4693 bv = float64_val(b);
4694 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the double-precision floating-point value `a' is less than or
4700 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4701 | cause an exception. Otherwise, the comparison is performed according to the
4702 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4703 *----------------------------------------------------------------------------*/
4705 int float64_le_quiet(float64 a, float64 b, float_status *status)
4707 flag aSign, bSign;
4708 uint64_t av, bv;
4709 a = float64_squash_input_denormal(a, status);
4710 b = float64_squash_input_denormal(b, status);
4712 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4713 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4715 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4716 float_raise(float_flag_invalid, status);
4718 return 0;
4720 aSign = extractFloat64Sign( a );
4721 bSign = extractFloat64Sign( b );
4722 av = float64_val(a);
4723 bv = float64_val(b);
4724 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4725 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4729 /*----------------------------------------------------------------------------
4730 | Returns 1 if the double-precision floating-point value `a' is less than
4731 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4732 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4733 | Standard for Binary Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4736 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4738 flag aSign, bSign;
4739 uint64_t av, bv;
4740 a = float64_squash_input_denormal(a, status);
4741 b = float64_squash_input_denormal(b, status);
4743 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4744 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4746 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4747 float_raise(float_flag_invalid, status);
4749 return 0;
4751 aSign = extractFloat64Sign( a );
4752 bSign = extractFloat64Sign( b );
4753 av = float64_val(a);
4754 bv = float64_val(b);
4755 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4756 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4760 /*----------------------------------------------------------------------------
4761 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4762 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4763 | comparison is performed according to the IEC/IEEE Standard for Binary
4764 | Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4767 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4769 a = float64_squash_input_denormal(a, status);
4770 b = float64_squash_input_denormal(b, status);
4772 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4773 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4775 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4776 float_raise(float_flag_invalid, status);
4778 return 1;
4780 return 0;
4783 /*----------------------------------------------------------------------------
4784 | Returns the result of converting the extended double-precision floating-
4785 | point value `a' to the 32-bit two's complement integer format. The
4786 | conversion is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic---which means in particular that the conversion
4788 | is rounded according to the current rounding mode. If `a' is a NaN, the
4789 | largest positive integer is returned. Otherwise, if the conversion
4790 | overflows, the largest integer with the same sign as `a' is returned.
4791 *----------------------------------------------------------------------------*/
4793 int32 floatx80_to_int32(floatx80 a, float_status *status)
4795 flag aSign;
4796 int32 aExp, shiftCount;
4797 uint64_t aSig;
4799 aSig = extractFloatx80Frac( a );
4800 aExp = extractFloatx80Exp( a );
4801 aSign = extractFloatx80Sign( a );
4802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4803 shiftCount = 0x4037 - aExp;
4804 if ( shiftCount <= 0 ) shiftCount = 1;
4805 shift64RightJamming( aSig, shiftCount, &aSig );
4806 return roundAndPackInt32(aSign, aSig, status);
4810 /*----------------------------------------------------------------------------
4811 | Returns the result of converting the extended double-precision floating-
4812 | point value `a' to the 32-bit two's complement integer format. The
4813 | conversion is performed according to the IEC/IEEE Standard for Binary
4814 | Floating-Point Arithmetic, except that the conversion is always rounded
4815 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4816 | Otherwise, if the conversion overflows, the largest integer with the same
4817 | sign as `a' is returned.
4818 *----------------------------------------------------------------------------*/
4820 int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4822 flag aSign;
4823 int32 aExp, shiftCount;
4824 uint64_t aSig, savedASig;
4825 int32_t z;
4827 aSig = extractFloatx80Frac( a );
4828 aExp = extractFloatx80Exp( a );
4829 aSign = extractFloatx80Sign( a );
4830 if ( 0x401E < aExp ) {
4831 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4832 goto invalid;
4834 else if ( aExp < 0x3FFF ) {
4835 if (aExp || aSig) {
4836 status->float_exception_flags |= float_flag_inexact;
4838 return 0;
4840 shiftCount = 0x403E - aExp;
4841 savedASig = aSig;
4842 aSig >>= shiftCount;
4843 z = aSig;
4844 if ( aSign ) z = - z;
4845 if ( ( z < 0 ) ^ aSign ) {
4846 invalid:
4847 float_raise(float_flag_invalid, status);
4848 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4850 if ( ( aSig<<shiftCount ) != savedASig ) {
4851 status->float_exception_flags |= float_flag_inexact;
4853 return z;
4857 /*----------------------------------------------------------------------------
4858 | Returns the result of converting the extended double-precision floating-
4859 | point value `a' to the 64-bit two's complement integer format. The
4860 | conversion is performed according to the IEC/IEEE Standard for Binary
4861 | Floating-Point Arithmetic---which means in particular that the conversion
4862 | is rounded according to the current rounding mode. If `a' is a NaN,
4863 | the largest positive integer is returned. Otherwise, if the conversion
4864 | overflows, the largest integer with the same sign as `a' is returned.
4865 *----------------------------------------------------------------------------*/
4867 int64 floatx80_to_int64(floatx80 a, float_status *status)
4869 flag aSign;
4870 int32 aExp, shiftCount;
4871 uint64_t aSig, aSigExtra;
4873 aSig = extractFloatx80Frac( a );
4874 aExp = extractFloatx80Exp( a );
4875 aSign = extractFloatx80Sign( a );
4876 shiftCount = 0x403E - aExp;
4877 if ( shiftCount <= 0 ) {
4878 if ( shiftCount ) {
4879 float_raise(float_flag_invalid, status);
4880 if ( ! aSign
4881 || ( ( aExp == 0x7FFF )
4882 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4884 return LIT64( 0x7FFFFFFFFFFFFFFF );
4886 return (int64_t) LIT64( 0x8000000000000000 );
4888 aSigExtra = 0;
4890 else {
4891 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4893 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of converting the extended double-precision floating-
4899 | point value `a' to the 64-bit two's complement integer format. The
4900 | conversion is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic, except that the conversion is always rounded
4902 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4903 | Otherwise, if the conversion overflows, the largest integer with the same
4904 | sign as `a' is returned.
4905 *----------------------------------------------------------------------------*/
4907 int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4909 flag aSign;
4910 int32 aExp, shiftCount;
4911 uint64_t aSig;
4912 int64 z;
4914 aSig = extractFloatx80Frac( a );
4915 aExp = extractFloatx80Exp( a );
4916 aSign = extractFloatx80Sign( a );
4917 shiftCount = aExp - 0x403E;
4918 if ( 0 <= shiftCount ) {
4919 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4920 if ( ( a.high != 0xC03E ) || aSig ) {
4921 float_raise(float_flag_invalid, status);
4922 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4923 return LIT64( 0x7FFFFFFFFFFFFFFF );
4926 return (int64_t) LIT64( 0x8000000000000000 );
4928 else if ( aExp < 0x3FFF ) {
4929 if (aExp | aSig) {
4930 status->float_exception_flags |= float_flag_inexact;
4932 return 0;
4934 z = aSig>>( - shiftCount );
4935 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4936 status->float_exception_flags |= float_flag_inexact;
4938 if ( aSign ) z = - z;
4939 return z;
4943 /*----------------------------------------------------------------------------
4944 | Returns the result of converting the extended double-precision floating-
4945 | point value `a' to the single-precision floating-point format. The
4946 | conversion is performed according to the IEC/IEEE Standard for Binary
4947 | Floating-Point Arithmetic.
4948 *----------------------------------------------------------------------------*/
4950 float32 floatx80_to_float32(floatx80 a, float_status *status)
4952 flag aSign;
4953 int32 aExp;
4954 uint64_t aSig;
4956 aSig = extractFloatx80Frac( a );
4957 aExp = extractFloatx80Exp( a );
4958 aSign = extractFloatx80Sign( a );
4959 if ( aExp == 0x7FFF ) {
4960 if ( (uint64_t) ( aSig<<1 ) ) {
4961 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4963 return packFloat32( aSign, 0xFF, 0 );
4965 shift64RightJamming( aSig, 33, &aSig );
4966 if ( aExp || aSig ) aExp -= 0x3F81;
4967 return roundAndPackFloat32(aSign, aExp, aSig, status);
4971 /*----------------------------------------------------------------------------
4972 | Returns the result of converting the extended double-precision floating-
4973 | point value `a' to the double-precision floating-point format. The
4974 | conversion is performed according to the IEC/IEEE Standard for Binary
4975 | Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4978 float64 floatx80_to_float64(floatx80 a, float_status *status)
4980 flag aSign;
4981 int32 aExp;
4982 uint64_t aSig, zSig;
4984 aSig = extractFloatx80Frac( a );
4985 aExp = extractFloatx80Exp( a );
4986 aSign = extractFloatx80Sign( a );
4987 if ( aExp == 0x7FFF ) {
4988 if ( (uint64_t) ( aSig<<1 ) ) {
4989 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4991 return packFloat64( aSign, 0x7FF, 0 );
4993 shift64RightJamming( aSig, 1, &zSig );
4994 if ( aExp || aSig ) aExp -= 0x3C01;
4995 return roundAndPackFloat64(aSign, aExp, zSig, status);
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the extended double-precision floating-
5001 | point value `a' to the quadruple-precision floating-point format. The
5002 | conversion is performed according to the IEC/IEEE Standard for Binary
5003 | Floating-Point Arithmetic.
5004 *----------------------------------------------------------------------------*/
5006 float128 floatx80_to_float128(floatx80 a, float_status *status)
5008 flag aSign;
5009 int_fast16_t aExp;
5010 uint64_t aSig, zSig0, zSig1;
5012 aSig = extractFloatx80Frac( a );
5013 aExp = extractFloatx80Exp( a );
5014 aSign = extractFloatx80Sign( a );
5015 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5016 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5018 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5019 return packFloat128( aSign, aExp, zSig0, zSig1 );
5023 /*----------------------------------------------------------------------------
5024 | Rounds the extended double-precision floating-point value `a' to an integer,
5025 | and returns the result as an extended quadruple-precision floating-point
5026 | value. The operation is performed according to the IEC/IEEE Standard for
5027 | Binary Floating-Point Arithmetic.
5028 *----------------------------------------------------------------------------*/
5030 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5032 flag aSign;
5033 int32 aExp;
5034 uint64_t lastBitMask, roundBitsMask;
5035 floatx80 z;
5037 aExp = extractFloatx80Exp( a );
5038 if ( 0x403E <= aExp ) {
5039 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5040 return propagateFloatx80NaN(a, a, status);
5042 return a;
5044 if ( aExp < 0x3FFF ) {
5045 if ( ( aExp == 0 )
5046 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5047 return a;
5049 status->float_exception_flags |= float_flag_inexact;
5050 aSign = extractFloatx80Sign( a );
5051 switch (status->float_rounding_mode) {
5052 case float_round_nearest_even:
5053 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5055 return
5056 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5058 break;
5059 case float_round_ties_away:
5060 if (aExp == 0x3FFE) {
5061 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5063 break;
5064 case float_round_down:
5065 return
5066 aSign ?
5067 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5068 : packFloatx80( 0, 0, 0 );
5069 case float_round_up:
5070 return
5071 aSign ? packFloatx80( 1, 0, 0 )
5072 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5074 return packFloatx80( aSign, 0, 0 );
5076 lastBitMask = 1;
5077 lastBitMask <<= 0x403E - aExp;
5078 roundBitsMask = lastBitMask - 1;
5079 z = a;
5080 switch (status->float_rounding_mode) {
5081 case float_round_nearest_even:
5082 z.low += lastBitMask>>1;
5083 if ((z.low & roundBitsMask) == 0) {
5084 z.low &= ~lastBitMask;
5086 break;
5087 case float_round_ties_away:
5088 z.low += lastBitMask >> 1;
5089 break;
5090 case float_round_to_zero:
5091 break;
5092 case float_round_up:
5093 if (!extractFloatx80Sign(z)) {
5094 z.low += roundBitsMask;
5096 break;
5097 case float_round_down:
5098 if (extractFloatx80Sign(z)) {
5099 z.low += roundBitsMask;
5101 break;
5102 default:
5103 abort();
5105 z.low &= ~ roundBitsMask;
5106 if ( z.low == 0 ) {
5107 ++z.high;
5108 z.low = LIT64( 0x8000000000000000 );
5110 if (z.low != a.low) {
5111 status->float_exception_flags |= float_flag_inexact;
5113 return z;
5117 /*----------------------------------------------------------------------------
5118 | Returns the result of adding the absolute values of the extended double-
5119 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5120 | negated before being returned. `zSign' is ignored if the result is a NaN.
5121 | The addition is performed according to the IEC/IEEE Standard for Binary
5122 | Floating-Point Arithmetic.
5123 *----------------------------------------------------------------------------*/
5125 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5126 float_status *status)
5128 int32 aExp, bExp, zExp;
5129 uint64_t aSig, bSig, zSig0, zSig1;
5130 int32 expDiff;
5132 aSig = extractFloatx80Frac( a );
5133 aExp = extractFloatx80Exp( a );
5134 bSig = extractFloatx80Frac( b );
5135 bExp = extractFloatx80Exp( b );
5136 expDiff = aExp - bExp;
5137 if ( 0 < expDiff ) {
5138 if ( aExp == 0x7FFF ) {
5139 if ((uint64_t)(aSig << 1)) {
5140 return propagateFloatx80NaN(a, b, status);
5142 return a;
5144 if ( bExp == 0 ) --expDiff;
5145 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5146 zExp = aExp;
5148 else if ( expDiff < 0 ) {
5149 if ( bExp == 0x7FFF ) {
5150 if ((uint64_t)(bSig << 1)) {
5151 return propagateFloatx80NaN(a, b, status);
5153 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5155 if ( aExp == 0 ) ++expDiff;
5156 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5157 zExp = bExp;
5159 else {
5160 if ( aExp == 0x7FFF ) {
5161 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5162 return propagateFloatx80NaN(a, b, status);
5164 return a;
5166 zSig1 = 0;
5167 zSig0 = aSig + bSig;
5168 if ( aExp == 0 ) {
5169 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5170 goto roundAndPack;
5172 zExp = aExp;
5173 goto shiftRight1;
5175 zSig0 = aSig + bSig;
5176 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5177 shiftRight1:
5178 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5179 zSig0 |= LIT64( 0x8000000000000000 );
5180 ++zExp;
5181 roundAndPack:
5182 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183 zSign, zExp, zSig0, zSig1, status);
5186 /*----------------------------------------------------------------------------
5187 | Returns the result of subtracting the absolute values of the extended
5188 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5189 | difference is negated before being returned. `zSign' is ignored if the
5190 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5191 | Standard for Binary Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5194 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195 float_status *status)
5197 int32 aExp, bExp, zExp;
5198 uint64_t aSig, bSig, zSig0, zSig1;
5199 int32 expDiff;
5200 floatx80 z;
5202 aSig = extractFloatx80Frac( a );
5203 aExp = extractFloatx80Exp( a );
5204 bSig = extractFloatx80Frac( b );
5205 bExp = extractFloatx80Exp( b );
5206 expDiff = aExp - bExp;
5207 if ( 0 < expDiff ) goto aExpBigger;
5208 if ( expDiff < 0 ) goto bExpBigger;
5209 if ( aExp == 0x7FFF ) {
5210 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5211 return propagateFloatx80NaN(a, b, status);
5213 float_raise(float_flag_invalid, status);
5214 z.low = floatx80_default_nan_low;
5215 z.high = floatx80_default_nan_high;
5216 return z;
5218 if ( aExp == 0 ) {
5219 aExp = 1;
5220 bExp = 1;
5222 zSig1 = 0;
5223 if ( bSig < aSig ) goto aBigger;
5224 if ( aSig < bSig ) goto bBigger;
5225 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5226 bExpBigger:
5227 if ( bExp == 0x7FFF ) {
5228 if ((uint64_t)(bSig << 1)) {
5229 return propagateFloatx80NaN(a, b, status);
5231 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5233 if ( aExp == 0 ) ++expDiff;
5234 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5235 bBigger:
5236 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5237 zExp = bExp;
5238 zSign ^= 1;
5239 goto normalizeRoundAndPack;
5240 aExpBigger:
5241 if ( aExp == 0x7FFF ) {
5242 if ((uint64_t)(aSig << 1)) {
5243 return propagateFloatx80NaN(a, b, status);
5245 return a;
5247 if ( bExp == 0 ) --expDiff;
5248 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5249 aBigger:
5250 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5251 zExp = aExp;
5252 normalizeRoundAndPack:
5253 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5254 zSign, zExp, zSig0, zSig1, status);
5257 /*----------------------------------------------------------------------------
5258 | Returns the result of adding the extended double-precision floating-point
5259 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5263 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5265 flag aSign, bSign;
5267 aSign = extractFloatx80Sign( a );
5268 bSign = extractFloatx80Sign( b );
5269 if ( aSign == bSign ) {
5270 return addFloatx80Sigs(a, b, aSign, status);
5272 else {
5273 return subFloatx80Sigs(a, b, aSign, status);
5278 /*----------------------------------------------------------------------------
5279 | Returns the result of subtracting the extended double-precision floating-
5280 | point values `a' and `b'. The operation is performed according to the
5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282 *----------------------------------------------------------------------------*/
5284 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5286 flag aSign, bSign;
5288 aSign = extractFloatx80Sign( a );
5289 bSign = extractFloatx80Sign( b );
5290 if ( aSign == bSign ) {
5291 return subFloatx80Sigs(a, b, aSign, status);
5293 else {
5294 return addFloatx80Sigs(a, b, aSign, status);
5299 /*----------------------------------------------------------------------------
5300 | Returns the result of multiplying the extended double-precision floating-
5301 | point values `a' and `b'. The operation is performed according to the
5302 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 *----------------------------------------------------------------------------*/
5305 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5307 flag aSign, bSign, zSign;
5308 int32 aExp, bExp, zExp;
5309 uint64_t aSig, bSig, zSig0, zSig1;
5310 floatx80 z;
5312 aSig = extractFloatx80Frac( a );
5313 aExp = extractFloatx80Exp( a );
5314 aSign = extractFloatx80Sign( a );
5315 bSig = extractFloatx80Frac( b );
5316 bExp = extractFloatx80Exp( b );
5317 bSign = extractFloatx80Sign( b );
5318 zSign = aSign ^ bSign;
5319 if ( aExp == 0x7FFF ) {
5320 if ( (uint64_t) ( aSig<<1 )
5321 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5322 return propagateFloatx80NaN(a, b, status);
5324 if ( ( bExp | bSig ) == 0 ) goto invalid;
5325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5327 if ( bExp == 0x7FFF ) {
5328 if ((uint64_t)(bSig << 1)) {
5329 return propagateFloatx80NaN(a, b, status);
5331 if ( ( aExp | aSig ) == 0 ) {
5332 invalid:
5333 float_raise(float_flag_invalid, status);
5334 z.low = floatx80_default_nan_low;
5335 z.high = floatx80_default_nan_high;
5336 return z;
5338 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5340 if ( aExp == 0 ) {
5341 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5342 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5344 if ( bExp == 0 ) {
5345 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5346 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5348 zExp = aExp + bExp - 0x3FFE;
5349 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5350 if ( 0 < (int64_t) zSig0 ) {
5351 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5352 --zExp;
5354 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5355 zSign, zExp, zSig0, zSig1, status);
5358 /*----------------------------------------------------------------------------
5359 | Returns the result of dividing the extended double-precision floating-point
5360 | value `a' by the corresponding value `b'. The operation is performed
5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5364 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5366 flag aSign, bSign, zSign;
5367 int32 aExp, bExp, zExp;
5368 uint64_t aSig, bSig, zSig0, zSig1;
5369 uint64_t rem0, rem1, rem2, term0, term1, term2;
5370 floatx80 z;
5372 aSig = extractFloatx80Frac( a );
5373 aExp = extractFloatx80Exp( a );
5374 aSign = extractFloatx80Sign( a );
5375 bSig = extractFloatx80Frac( b );
5376 bExp = extractFloatx80Exp( b );
5377 bSign = extractFloatx80Sign( b );
5378 zSign = aSign ^ bSign;
5379 if ( aExp == 0x7FFF ) {
5380 if ((uint64_t)(aSig << 1)) {
5381 return propagateFloatx80NaN(a, b, status);
5383 if ( bExp == 0x7FFF ) {
5384 if ((uint64_t)(bSig << 1)) {
5385 return propagateFloatx80NaN(a, b, status);
5387 goto invalid;
5389 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5391 if ( bExp == 0x7FFF ) {
5392 if ((uint64_t)(bSig << 1)) {
5393 return propagateFloatx80NaN(a, b, status);
5395 return packFloatx80( zSign, 0, 0 );
5397 if ( bExp == 0 ) {
5398 if ( bSig == 0 ) {
5399 if ( ( aExp | aSig ) == 0 ) {
5400 invalid:
5401 float_raise(float_flag_invalid, status);
5402 z.low = floatx80_default_nan_low;
5403 z.high = floatx80_default_nan_high;
5404 return z;
5406 float_raise(float_flag_divbyzero, status);
5407 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5409 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5411 if ( aExp == 0 ) {
5412 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5413 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5415 zExp = aExp - bExp + 0x3FFE;
5416 rem1 = 0;
5417 if ( bSig <= aSig ) {
5418 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5419 ++zExp;
5421 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5422 mul64To128( bSig, zSig0, &term0, &term1 );
5423 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5424 while ( (int64_t) rem0 < 0 ) {
5425 --zSig0;
5426 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5428 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5429 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5430 mul64To128( bSig, zSig1, &term1, &term2 );
5431 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5432 while ( (int64_t) rem1 < 0 ) {
5433 --zSig1;
5434 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5436 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5438 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5439 zSign, zExp, zSig0, zSig1, status);
5442 /*----------------------------------------------------------------------------
5443 | Returns the remainder of the extended double-precision floating-point value
5444 | `a' with respect to the corresponding value `b'. The operation is performed
5445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5446 *----------------------------------------------------------------------------*/
5448 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5450 flag aSign, zSign;
5451 int32 aExp, bExp, expDiff;
5452 uint64_t aSig0, aSig1, bSig;
5453 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5454 floatx80 z;
5456 aSig0 = extractFloatx80Frac( a );
5457 aExp = extractFloatx80Exp( a );
5458 aSign = extractFloatx80Sign( a );
5459 bSig = extractFloatx80Frac( b );
5460 bExp = extractFloatx80Exp( b );
5461 if ( aExp == 0x7FFF ) {
5462 if ( (uint64_t) ( aSig0<<1 )
5463 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5464 return propagateFloatx80NaN(a, b, status);
5466 goto invalid;
5468 if ( bExp == 0x7FFF ) {
5469 if ((uint64_t)(bSig << 1)) {
5470 return propagateFloatx80NaN(a, b, status);
5472 return a;
5474 if ( bExp == 0 ) {
5475 if ( bSig == 0 ) {
5476 invalid:
5477 float_raise(float_flag_invalid, status);
5478 z.low = floatx80_default_nan_low;
5479 z.high = floatx80_default_nan_high;
5480 return z;
5482 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5484 if ( aExp == 0 ) {
5485 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5486 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5488 bSig |= LIT64( 0x8000000000000000 );
5489 zSign = aSign;
5490 expDiff = aExp - bExp;
5491 aSig1 = 0;
5492 if ( expDiff < 0 ) {
5493 if ( expDiff < -1 ) return a;
5494 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5495 expDiff = 0;
5497 q = ( bSig <= aSig0 );
5498 if ( q ) aSig0 -= bSig;
5499 expDiff -= 64;
5500 while ( 0 < expDiff ) {
5501 q = estimateDiv128To64( aSig0, aSig1, bSig );
5502 q = ( 2 < q ) ? q - 2 : 0;
5503 mul64To128( bSig, q, &term0, &term1 );
5504 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5505 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5506 expDiff -= 62;
5508 expDiff += 64;
5509 if ( 0 < expDiff ) {
5510 q = estimateDiv128To64( aSig0, aSig1, bSig );
5511 q = ( 2 < q ) ? q - 2 : 0;
5512 q >>= 64 - expDiff;
5513 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5514 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5516 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5517 ++q;
5518 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5521 else {
5522 term1 = 0;
5523 term0 = bSig;
5525 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5526 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5528 && ( q & 1 ) )
5530 aSig0 = alternateASig0;
5531 aSig1 = alternateASig1;
5532 zSign = ! zSign;
5534 return
5535 normalizeRoundAndPackFloatx80(
5536 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5540 /*----------------------------------------------------------------------------
5541 | Returns the square root of the extended double-precision floating-point
5542 | value `a'. The operation is performed according to the IEC/IEEE Standard
5543 | for Binary Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5546 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5548 flag aSign;
5549 int32 aExp, zExp;
5550 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5552 floatx80 z;
5554 aSig0 = extractFloatx80Frac( a );
5555 aExp = extractFloatx80Exp( a );
5556 aSign = extractFloatx80Sign( a );
5557 if ( aExp == 0x7FFF ) {
5558 if ((uint64_t)(aSig0 << 1)) {
5559 return propagateFloatx80NaN(a, a, status);
5561 if ( ! aSign ) return a;
5562 goto invalid;
5564 if ( aSign ) {
5565 if ( ( aExp | aSig0 ) == 0 ) return a;
5566 invalid:
5567 float_raise(float_flag_invalid, status);
5568 z.low = floatx80_default_nan_low;
5569 z.high = floatx80_default_nan_high;
5570 return z;
5572 if ( aExp == 0 ) {
5573 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5574 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5576 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5577 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5578 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5579 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5580 doubleZSig0 = zSig0<<1;
5581 mul64To128( zSig0, zSig0, &term0, &term1 );
5582 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5583 while ( (int64_t) rem0 < 0 ) {
5584 --zSig0;
5585 doubleZSig0 -= 2;
5586 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5588 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5589 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5590 if ( zSig1 == 0 ) zSig1 = 1;
5591 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5592 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5593 mul64To128( zSig1, zSig1, &term2, &term3 );
5594 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5595 while ( (int64_t) rem1 < 0 ) {
5596 --zSig1;
5597 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5598 term3 |= 1;
5599 term2 |= doubleZSig0;
5600 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5602 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5604 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5605 zSig0 |= doubleZSig0;
5606 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607 0, zExp, zSig0, zSig1, status);
5610 /*----------------------------------------------------------------------------
5611 | Returns 1 if the extended double-precision floating-point value `a' is equal
5612 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5613 | raised if either operand is a NaN. Otherwise, the comparison is performed
5614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5615 *----------------------------------------------------------------------------*/
5617 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5620 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5621 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5622 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5623 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5625 float_raise(float_flag_invalid, status);
5626 return 0;
5628 return
5629 ( a.low == b.low )
5630 && ( ( a.high == b.high )
5631 || ( ( a.low == 0 )
5632 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5637 /*----------------------------------------------------------------------------
5638 | Returns 1 if the extended double-precision floating-point value `a' is
5639 | less than or equal to the corresponding value `b', and 0 otherwise. The
5640 | invalid exception is raised if either operand is a NaN. The comparison is
5641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5642 | Arithmetic.
5643 *----------------------------------------------------------------------------*/
5645 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5647 flag aSign, bSign;
5649 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5650 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5651 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5652 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5654 float_raise(float_flag_invalid, status);
5655 return 0;
5657 aSign = extractFloatx80Sign( a );
5658 bSign = extractFloatx80Sign( b );
5659 if ( aSign != bSign ) {
5660 return
5661 aSign
5662 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5663 == 0 );
5665 return
5666 aSign ? le128( b.high, b.low, a.high, a.low )
5667 : le128( a.high, a.low, b.high, b.low );
5671 /*----------------------------------------------------------------------------
5672 | Returns 1 if the extended double-precision floating-point value `a' is
5673 | less than the corresponding value `b', and 0 otherwise. The invalid
5674 | exception is raised if either operand is a NaN. The comparison is performed
5675 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5678 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5680 flag aSign, bSign;
5682 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5683 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5684 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5685 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5687 float_raise(float_flag_invalid, status);
5688 return 0;
5690 aSign = extractFloatx80Sign( a );
5691 bSign = extractFloatx80Sign( b );
5692 if ( aSign != bSign ) {
5693 return
5694 aSign
5695 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5696 != 0 );
5698 return
5699 aSign ? lt128( b.high, b.low, a.high, a.low )
5700 : lt128( a.high, a.low, b.high, b.low );
5704 /*----------------------------------------------------------------------------
5705 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5706 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5707 | either operand is a NaN. The comparison is performed according to the
5708 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709 *----------------------------------------------------------------------------*/
5710 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5712 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5713 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5714 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5715 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5717 float_raise(float_flag_invalid, status);
5718 return 1;
5720 return 0;
5723 /*----------------------------------------------------------------------------
5724 | Returns 1 if the extended double-precision floating-point value `a' is
5725 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5726 | cause an exception. The comparison is performed according to the IEC/IEEE
5727 | Standard for Binary Floating-Point Arithmetic.
5728 *----------------------------------------------------------------------------*/
5730 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5733 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5734 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5735 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5736 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5738 if ( floatx80_is_signaling_nan( a )
5739 || floatx80_is_signaling_nan( b ) ) {
5740 float_raise(float_flag_invalid, status);
5742 return 0;
5744 return
5745 ( a.low == b.low )
5746 && ( ( a.high == b.high )
5747 || ( ( a.low == 0 )
5748 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5753 /*----------------------------------------------------------------------------
5754 | Returns 1 if the extended double-precision floating-point value `a' is less
5755 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5756 | do not cause an exception. Otherwise, the comparison is performed according
5757 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5758 *----------------------------------------------------------------------------*/
5760 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5762 flag aSign, bSign;
5764 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5765 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5766 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5767 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5769 if ( floatx80_is_signaling_nan( a )
5770 || floatx80_is_signaling_nan( b ) ) {
5771 float_raise(float_flag_invalid, status);
5773 return 0;
5775 aSign = extractFloatx80Sign( a );
5776 bSign = extractFloatx80Sign( b );
5777 if ( aSign != bSign ) {
5778 return
5779 aSign
5780 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781 == 0 );
5783 return
5784 aSign ? le128( b.high, b.low, a.high, a.low )
5785 : le128( a.high, a.low, b.high, b.low );
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the extended double-precision floating-point value `a' is less
5791 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5792 | an exception. Otherwise, the comparison is performed according to the
5793 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5798 flag aSign, bSign;
5800 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5801 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5803 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5805 if ( floatx80_is_signaling_nan( a )
5806 || floatx80_is_signaling_nan( b ) ) {
5807 float_raise(float_flag_invalid, status);
5809 return 0;
5811 aSign = extractFloatx80Sign( a );
5812 bSign = extractFloatx80Sign( b );
5813 if ( aSign != bSign ) {
5814 return
5815 aSign
5816 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5817 != 0 );
5819 return
5820 aSign ? lt128( b.high, b.low, a.high, a.low )
5821 : lt128( a.high, a.low, b.high, b.low );
5825 /*----------------------------------------------------------------------------
5826 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5827 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5828 | The comparison is performed according to the IEC/IEEE Standard for Binary
5829 | Floating-Point Arithmetic.
5830 *----------------------------------------------------------------------------*/
5831 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5833 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5834 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5835 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5836 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5838 if ( floatx80_is_signaling_nan( a )
5839 || floatx80_is_signaling_nan( b ) ) {
5840 float_raise(float_flag_invalid, status);
5842 return 1;
5844 return 0;
5847 /*----------------------------------------------------------------------------
5848 | Returns the result of converting the quadruple-precision floating-point
5849 | value `a' to the 32-bit two's complement integer format. The conversion
5850 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5851 | Arithmetic---which means in particular that the conversion is rounded
5852 | according to the current rounding mode. If `a' is a NaN, the largest
5853 | positive integer is returned. Otherwise, if the conversion overflows, the
5854 | largest integer with the same sign as `a' is returned.
5855 *----------------------------------------------------------------------------*/
5857 int32 float128_to_int32(float128 a, float_status *status)
5859 flag aSign;
5860 int32 aExp, shiftCount;
5861 uint64_t aSig0, aSig1;
5863 aSig1 = extractFloat128Frac1( a );
5864 aSig0 = extractFloat128Frac0( a );
5865 aExp = extractFloat128Exp( a );
5866 aSign = extractFloat128Sign( a );
5867 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5868 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5869 aSig0 |= ( aSig1 != 0 );
5870 shiftCount = 0x4028 - aExp;
5871 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5872 return roundAndPackInt32(aSign, aSig0, status);
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of converting the quadruple-precision floating-point
5878 | value `a' to the 32-bit two's complement integer format. The conversion
5879 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5880 | Arithmetic, except that the conversion is always rounded toward zero. If
5881 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5882 | conversion overflows, the largest integer with the same sign as `a' is
5883 | returned.
5884 *----------------------------------------------------------------------------*/
5886 int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
5888 flag aSign;
5889 int32 aExp, shiftCount;
5890 uint64_t aSig0, aSig1, savedASig;
5891 int32_t z;
5893 aSig1 = extractFloat128Frac1( a );
5894 aSig0 = extractFloat128Frac0( a );
5895 aExp = extractFloat128Exp( a );
5896 aSign = extractFloat128Sign( a );
5897 aSig0 |= ( aSig1 != 0 );
5898 if ( 0x401E < aExp ) {
5899 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5900 goto invalid;
5902 else if ( aExp < 0x3FFF ) {
5903 if (aExp || aSig0) {
5904 status->float_exception_flags |= float_flag_inexact;
5906 return 0;
5908 aSig0 |= LIT64( 0x0001000000000000 );
5909 shiftCount = 0x402F - aExp;
5910 savedASig = aSig0;
5911 aSig0 >>= shiftCount;
5912 z = aSig0;
5913 if ( aSign ) z = - z;
5914 if ( ( z < 0 ) ^ aSign ) {
5915 invalid:
5916 float_raise(float_flag_invalid, status);
5917 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5919 if ( ( aSig0<<shiftCount ) != savedASig ) {
5920 status->float_exception_flags |= float_flag_inexact;
5922 return z;
5926 /*----------------------------------------------------------------------------
5927 | Returns the result of converting the quadruple-precision floating-point
5928 | value `a' to the 64-bit two's complement integer format. The conversion
5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 | Arithmetic---which means in particular that the conversion is rounded
5931 | according to the current rounding mode. If `a' is a NaN, the largest
5932 | positive integer is returned. Otherwise, if the conversion overflows, the
5933 | largest integer with the same sign as `a' is returned.
5934 *----------------------------------------------------------------------------*/
5936 int64 float128_to_int64(float128 a, float_status *status)
5938 flag aSign;
5939 int32 aExp, shiftCount;
5940 uint64_t aSig0, aSig1;
5942 aSig1 = extractFloat128Frac1( a );
5943 aSig0 = extractFloat128Frac0( a );
5944 aExp = extractFloat128Exp( a );
5945 aSign = extractFloat128Sign( a );
5946 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5947 shiftCount = 0x402F - aExp;
5948 if ( shiftCount <= 0 ) {
5949 if ( 0x403E < aExp ) {
5950 float_raise(float_flag_invalid, status);
5951 if ( ! aSign
5952 || ( ( aExp == 0x7FFF )
5953 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5956 return LIT64( 0x7FFFFFFFFFFFFFFF );
5958 return (int64_t) LIT64( 0x8000000000000000 );
5960 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5962 else {
5963 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5965 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5969 /*----------------------------------------------------------------------------
5970 | Returns the result of converting the quadruple-precision floating-point
5971 | value `a' to the 64-bit two's complement integer format. The conversion
5972 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5973 | Arithmetic, except that the conversion is always rounded toward zero.
5974 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5975 | the conversion overflows, the largest integer with the same sign as `a' is
5976 | returned.
5977 *----------------------------------------------------------------------------*/
5979 int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
5981 flag aSign;
5982 int32 aExp, shiftCount;
5983 uint64_t aSig0, aSig1;
5984 int64 z;
5986 aSig1 = extractFloat128Frac1( a );
5987 aSig0 = extractFloat128Frac0( a );
5988 aExp = extractFloat128Exp( a );
5989 aSign = extractFloat128Sign( a );
5990 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5991 shiftCount = aExp - 0x402F;
5992 if ( 0 < shiftCount ) {
5993 if ( 0x403E <= aExp ) {
5994 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5995 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5996 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5997 if (aSig1) {
5998 status->float_exception_flags |= float_flag_inexact;
6001 else {
6002 float_raise(float_flag_invalid, status);
6003 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6004 return LIT64( 0x7FFFFFFFFFFFFFFF );
6007 return (int64_t) LIT64( 0x8000000000000000 );
6009 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6010 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6011 status->float_exception_flags |= float_flag_inexact;
6014 else {
6015 if ( aExp < 0x3FFF ) {
6016 if ( aExp | aSig0 | aSig1 ) {
6017 status->float_exception_flags |= float_flag_inexact;
6019 return 0;
6021 z = aSig0>>( - shiftCount );
6022 if ( aSig1
6023 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6024 status->float_exception_flags |= float_flag_inexact;
6027 if ( aSign ) z = - z;
6028 return z;
6032 /*----------------------------------------------------------------------------
6033 | Returns the result of converting the quadruple-precision floating-point
6034 | value `a' to the single-precision floating-point format. The conversion
6035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6036 | Arithmetic.
6037 *----------------------------------------------------------------------------*/
6039 float32 float128_to_float32(float128 a, float_status *status)
6041 flag aSign;
6042 int32 aExp;
6043 uint64_t aSig0, aSig1;
6044 uint32_t zSig;
6046 aSig1 = extractFloat128Frac1( a );
6047 aSig0 = extractFloat128Frac0( a );
6048 aExp = extractFloat128Exp( a );
6049 aSign = extractFloat128Sign( a );
6050 if ( aExp == 0x7FFF ) {
6051 if ( aSig0 | aSig1 ) {
6052 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6054 return packFloat32( aSign, 0xFF, 0 );
6056 aSig0 |= ( aSig1 != 0 );
6057 shift64RightJamming( aSig0, 18, &aSig0 );
6058 zSig = aSig0;
6059 if ( aExp || zSig ) {
6060 zSig |= 0x40000000;
6061 aExp -= 0x3F81;
6063 return roundAndPackFloat32(aSign, aExp, zSig, status);
6067 /*----------------------------------------------------------------------------
6068 | Returns the result of converting the quadruple-precision floating-point
6069 | value `a' to the double-precision floating-point format. The conversion
6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071 | Arithmetic.
6072 *----------------------------------------------------------------------------*/
6074 float64 float128_to_float64(float128 a, float_status *status)
6076 flag aSign;
6077 int32 aExp;
6078 uint64_t aSig0, aSig1;
6080 aSig1 = extractFloat128Frac1( a );
6081 aSig0 = extractFloat128Frac0( a );
6082 aExp = extractFloat128Exp( a );
6083 aSign = extractFloat128Sign( a );
6084 if ( aExp == 0x7FFF ) {
6085 if ( aSig0 | aSig1 ) {
6086 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6088 return packFloat64( aSign, 0x7FF, 0 );
6090 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6091 aSig0 |= ( aSig1 != 0 );
6092 if ( aExp || aSig0 ) {
6093 aSig0 |= LIT64( 0x4000000000000000 );
6094 aExp -= 0x3C01;
6096 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6100 /*----------------------------------------------------------------------------
6101 | Returns the result of converting the quadruple-precision floating-point
6102 | value `a' to the extended double-precision floating-point format. The
6103 | conversion is performed according to the IEC/IEEE Standard for Binary
6104 | Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/
6107 floatx80 float128_to_floatx80(float128 a, float_status *status)
6109 flag aSign;
6110 int32 aExp;
6111 uint64_t aSig0, aSig1;
6113 aSig1 = extractFloat128Frac1( a );
6114 aSig0 = extractFloat128Frac0( a );
6115 aExp = extractFloat128Exp( a );
6116 aSign = extractFloat128Sign( a );
6117 if ( aExp == 0x7FFF ) {
6118 if ( aSig0 | aSig1 ) {
6119 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6121 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6123 if ( aExp == 0 ) {
6124 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6125 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6127 else {
6128 aSig0 |= LIT64( 0x0001000000000000 );
6130 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6131 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6135 /*----------------------------------------------------------------------------
6136 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6137 | returns the result as a quadruple-precision floating-point value. The
6138 | operation is performed according to the IEC/IEEE Standard for Binary
6139 | Floating-Point Arithmetic.
6140 *----------------------------------------------------------------------------*/
6142 float128 float128_round_to_int(float128 a, float_status *status)
6144 flag aSign;
6145 int32 aExp;
6146 uint64_t lastBitMask, roundBitsMask;
6147 float128 z;
6149 aExp = extractFloat128Exp( a );
6150 if ( 0x402F <= aExp ) {
6151 if ( 0x406F <= aExp ) {
6152 if ( ( aExp == 0x7FFF )
6153 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6155 return propagateFloat128NaN(a, a, status);
6157 return a;
6159 lastBitMask = 1;
6160 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6161 roundBitsMask = lastBitMask - 1;
6162 z = a;
6163 switch (status->float_rounding_mode) {
6164 case float_round_nearest_even:
6165 if ( lastBitMask ) {
6166 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6167 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6169 else {
6170 if ( (int64_t) z.low < 0 ) {
6171 ++z.high;
6172 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6175 break;
6176 case float_round_ties_away:
6177 if (lastBitMask) {
6178 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6179 } else {
6180 if ((int64_t) z.low < 0) {
6181 ++z.high;
6184 break;
6185 case float_round_to_zero:
6186 break;
6187 case float_round_up:
6188 if (!extractFloat128Sign(z)) {
6189 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6191 break;
6192 case float_round_down:
6193 if (extractFloat128Sign(z)) {
6194 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6196 break;
6197 default:
6198 abort();
6200 z.low &= ~ roundBitsMask;
6202 else {
6203 if ( aExp < 0x3FFF ) {
6204 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6205 status->float_exception_flags |= float_flag_inexact;
6206 aSign = extractFloat128Sign( a );
6207 switch (status->float_rounding_mode) {
6208 case float_round_nearest_even:
6209 if ( ( aExp == 0x3FFE )
6210 && ( extractFloat128Frac0( a )
6211 | extractFloat128Frac1( a ) )
6213 return packFloat128( aSign, 0x3FFF, 0, 0 );
6215 break;
6216 case float_round_ties_away:
6217 if (aExp == 0x3FFE) {
6218 return packFloat128(aSign, 0x3FFF, 0, 0);
6220 break;
6221 case float_round_down:
6222 return
6223 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6224 : packFloat128( 0, 0, 0, 0 );
6225 case float_round_up:
6226 return
6227 aSign ? packFloat128( 1, 0, 0, 0 )
6228 : packFloat128( 0, 0x3FFF, 0, 0 );
6230 return packFloat128( aSign, 0, 0, 0 );
6232 lastBitMask = 1;
6233 lastBitMask <<= 0x402F - aExp;
6234 roundBitsMask = lastBitMask - 1;
6235 z.low = 0;
6236 z.high = a.high;
6237 switch (status->float_rounding_mode) {
6238 case float_round_nearest_even:
6239 z.high += lastBitMask>>1;
6240 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6241 z.high &= ~ lastBitMask;
6243 break;
6244 case float_round_ties_away:
6245 z.high += lastBitMask>>1;
6246 break;
6247 case float_round_to_zero:
6248 break;
6249 case float_round_up:
6250 if (!extractFloat128Sign(z)) {
6251 z.high |= ( a.low != 0 );
6252 z.high += roundBitsMask;
6254 break;
6255 case float_round_down:
6256 if (extractFloat128Sign(z)) {
6257 z.high |= (a.low != 0);
6258 z.high += roundBitsMask;
6260 break;
6261 default:
6262 abort();
6264 z.high &= ~ roundBitsMask;
6266 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6267 status->float_exception_flags |= float_flag_inexact;
6269 return z;
6273 /*----------------------------------------------------------------------------
6274 | Returns the result of adding the absolute values of the quadruple-precision
6275 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6276 | before being returned. `zSign' is ignored if the result is a NaN.
6277 | The addition is performed according to the IEC/IEEE Standard for Binary
6278 | Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6281 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6282 float_status *status)
6284 int32 aExp, bExp, zExp;
6285 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6286 int32 expDiff;
6288 aSig1 = extractFloat128Frac1( a );
6289 aSig0 = extractFloat128Frac0( a );
6290 aExp = extractFloat128Exp( a );
6291 bSig1 = extractFloat128Frac1( b );
6292 bSig0 = extractFloat128Frac0( b );
6293 bExp = extractFloat128Exp( b );
6294 expDiff = aExp - bExp;
6295 if ( 0 < expDiff ) {
6296 if ( aExp == 0x7FFF ) {
6297 if (aSig0 | aSig1) {
6298 return propagateFloat128NaN(a, b, status);
6300 return a;
6302 if ( bExp == 0 ) {
6303 --expDiff;
6305 else {
6306 bSig0 |= LIT64( 0x0001000000000000 );
6308 shift128ExtraRightJamming(
6309 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6310 zExp = aExp;
6312 else if ( expDiff < 0 ) {
6313 if ( bExp == 0x7FFF ) {
6314 if (bSig0 | bSig1) {
6315 return propagateFloat128NaN(a, b, status);
6317 return packFloat128( zSign, 0x7FFF, 0, 0 );
6319 if ( aExp == 0 ) {
6320 ++expDiff;
6322 else {
6323 aSig0 |= LIT64( 0x0001000000000000 );
6325 shift128ExtraRightJamming(
6326 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6327 zExp = bExp;
6329 else {
6330 if ( aExp == 0x7FFF ) {
6331 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6332 return propagateFloat128NaN(a, b, status);
6334 return a;
6336 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6337 if ( aExp == 0 ) {
6338 if (status->flush_to_zero) {
6339 if (zSig0 | zSig1) {
6340 float_raise(float_flag_output_denormal, status);
6342 return packFloat128(zSign, 0, 0, 0);
6344 return packFloat128( zSign, 0, zSig0, zSig1 );
6346 zSig2 = 0;
6347 zSig0 |= LIT64( 0x0002000000000000 );
6348 zExp = aExp;
6349 goto shiftRight1;
6351 aSig0 |= LIT64( 0x0001000000000000 );
6352 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6353 --zExp;
6354 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6355 ++zExp;
6356 shiftRight1:
6357 shift128ExtraRightJamming(
6358 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6359 roundAndPack:
6360 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of subtracting the absolute values of the quadruple-
6366 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6367 | difference is negated before being returned. `zSign' is ignored if the
6368 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6369 | Standard for Binary Floating-Point Arithmetic.
6370 *----------------------------------------------------------------------------*/
6372 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6373 float_status *status)
6375 int32 aExp, bExp, zExp;
6376 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6377 int32 expDiff;
6378 float128 z;
6380 aSig1 = extractFloat128Frac1( a );
6381 aSig0 = extractFloat128Frac0( a );
6382 aExp = extractFloat128Exp( a );
6383 bSig1 = extractFloat128Frac1( b );
6384 bSig0 = extractFloat128Frac0( b );
6385 bExp = extractFloat128Exp( b );
6386 expDiff = aExp - bExp;
6387 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6388 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6389 if ( 0 < expDiff ) goto aExpBigger;
6390 if ( expDiff < 0 ) goto bExpBigger;
6391 if ( aExp == 0x7FFF ) {
6392 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6393 return propagateFloat128NaN(a, b, status);
6395 float_raise(float_flag_invalid, status);
6396 z.low = float128_default_nan_low;
6397 z.high = float128_default_nan_high;
6398 return z;
6400 if ( aExp == 0 ) {
6401 aExp = 1;
6402 bExp = 1;
6404 if ( bSig0 < aSig0 ) goto aBigger;
6405 if ( aSig0 < bSig0 ) goto bBigger;
6406 if ( bSig1 < aSig1 ) goto aBigger;
6407 if ( aSig1 < bSig1 ) goto bBigger;
6408 return packFloat128(status->float_rounding_mode == float_round_down,
6409 0, 0, 0);
6410 bExpBigger:
6411 if ( bExp == 0x7FFF ) {
6412 if (bSig0 | bSig1) {
6413 return propagateFloat128NaN(a, b, status);
6415 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6417 if ( aExp == 0 ) {
6418 ++expDiff;
6420 else {
6421 aSig0 |= LIT64( 0x4000000000000000 );
6423 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424 bSig0 |= LIT64( 0x4000000000000000 );
6425 bBigger:
6426 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6427 zExp = bExp;
6428 zSign ^= 1;
6429 goto normalizeRoundAndPack;
6430 aExpBigger:
6431 if ( aExp == 0x7FFF ) {
6432 if (aSig0 | aSig1) {
6433 return propagateFloat128NaN(a, b, status);
6435 return a;
6437 if ( bExp == 0 ) {
6438 --expDiff;
6440 else {
6441 bSig0 |= LIT64( 0x4000000000000000 );
6443 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444 aSig0 |= LIT64( 0x4000000000000000 );
6445 aBigger:
6446 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6447 zExp = aExp;
6448 normalizeRoundAndPack:
6449 --zExp;
6450 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6451 status);
6455 /*----------------------------------------------------------------------------
6456 | Returns the result of adding the quadruple-precision floating-point values
6457 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6458 | for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6461 float128 float128_add(float128 a, float128 b, float_status *status)
6463 flag aSign, bSign;
6465 aSign = extractFloat128Sign( a );
6466 bSign = extractFloat128Sign( b );
6467 if ( aSign == bSign ) {
6468 return addFloat128Sigs(a, b, aSign, status);
6470 else {
6471 return subFloat128Sigs(a, b, aSign, status);
6476 /*----------------------------------------------------------------------------
6477 | Returns the result of subtracting the quadruple-precision floating-point
6478 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6479 | Standard for Binary Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6482 float128 float128_sub(float128 a, float128 b, float_status *status)
6484 flag aSign, bSign;
6486 aSign = extractFloat128Sign( a );
6487 bSign = extractFloat128Sign( b );
6488 if ( aSign == bSign ) {
6489 return subFloat128Sigs(a, b, aSign, status);
6491 else {
6492 return addFloat128Sigs(a, b, aSign, status);
6497 /*----------------------------------------------------------------------------
6498 | Returns the result of multiplying the quadruple-precision floating-point
6499 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6500 | Standard for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6503 float128 float128_mul(float128 a, float128 b, float_status *status)
6505 flag aSign, bSign, zSign;
6506 int32 aExp, bExp, zExp;
6507 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6508 float128 z;
6510 aSig1 = extractFloat128Frac1( a );
6511 aSig0 = extractFloat128Frac0( a );
6512 aExp = extractFloat128Exp( a );
6513 aSign = extractFloat128Sign( a );
6514 bSig1 = extractFloat128Frac1( b );
6515 bSig0 = extractFloat128Frac0( b );
6516 bExp = extractFloat128Exp( b );
6517 bSign = extractFloat128Sign( b );
6518 zSign = aSign ^ bSign;
6519 if ( aExp == 0x7FFF ) {
6520 if ( ( aSig0 | aSig1 )
6521 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522 return propagateFloat128NaN(a, b, status);
6524 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6525 return packFloat128( zSign, 0x7FFF, 0, 0 );
6527 if ( bExp == 0x7FFF ) {
6528 if (bSig0 | bSig1) {
6529 return propagateFloat128NaN(a, b, status);
6531 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6532 invalid:
6533 float_raise(float_flag_invalid, status);
6534 z.low = float128_default_nan_low;
6535 z.high = float128_default_nan_high;
6536 return z;
6538 return packFloat128( zSign, 0x7FFF, 0, 0 );
6540 if ( aExp == 0 ) {
6541 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6542 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6544 if ( bExp == 0 ) {
6545 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6546 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6548 zExp = aExp + bExp - 0x4000;
6549 aSig0 |= LIT64( 0x0001000000000000 );
6550 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6551 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6552 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6553 zSig2 |= ( zSig3 != 0 );
6554 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6555 shift128ExtraRightJamming(
6556 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6557 ++zExp;
6559 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6563 /*----------------------------------------------------------------------------
6564 | Returns the result of dividing the quadruple-precision floating-point value
6565 | `a' by the corresponding value `b'. The operation is performed according to
6566 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6567 *----------------------------------------------------------------------------*/
6569 float128 float128_div(float128 a, float128 b, float_status *status)
6571 flag aSign, bSign, zSign;
6572 int32 aExp, bExp, zExp;
6573 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6575 float128 z;
6577 aSig1 = extractFloat128Frac1( a );
6578 aSig0 = extractFloat128Frac0( a );
6579 aExp = extractFloat128Exp( a );
6580 aSign = extractFloat128Sign( a );
6581 bSig1 = extractFloat128Frac1( b );
6582 bSig0 = extractFloat128Frac0( b );
6583 bExp = extractFloat128Exp( b );
6584 bSign = extractFloat128Sign( b );
6585 zSign = aSign ^ bSign;
6586 if ( aExp == 0x7FFF ) {
6587 if (aSig0 | aSig1) {
6588 return propagateFloat128NaN(a, b, status);
6590 if ( bExp == 0x7FFF ) {
6591 if (bSig0 | bSig1) {
6592 return propagateFloat128NaN(a, b, status);
6594 goto invalid;
6596 return packFloat128( zSign, 0x7FFF, 0, 0 );
6598 if ( bExp == 0x7FFF ) {
6599 if (bSig0 | bSig1) {
6600 return propagateFloat128NaN(a, b, status);
6602 return packFloat128( zSign, 0, 0, 0 );
6604 if ( bExp == 0 ) {
6605 if ( ( bSig0 | bSig1 ) == 0 ) {
6606 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6607 invalid:
6608 float_raise(float_flag_invalid, status);
6609 z.low = float128_default_nan_low;
6610 z.high = float128_default_nan_high;
6611 return z;
6613 float_raise(float_flag_divbyzero, status);
6614 return packFloat128( zSign, 0x7FFF, 0, 0 );
6616 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6618 if ( aExp == 0 ) {
6619 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6620 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6622 zExp = aExp - bExp + 0x3FFD;
6623 shortShift128Left(
6624 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6625 shortShift128Left(
6626 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6627 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6628 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6629 ++zExp;
6631 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6632 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6633 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6634 while ( (int64_t) rem0 < 0 ) {
6635 --zSig0;
6636 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6638 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6639 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6640 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6641 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6642 while ( (int64_t) rem1 < 0 ) {
6643 --zSig1;
6644 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6646 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6648 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6649 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6653 /*----------------------------------------------------------------------------
6654 | Returns the remainder of the quadruple-precision floating-point value `a'
6655 | with respect to the corresponding value `b'. The operation is performed
6656 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657 *----------------------------------------------------------------------------*/
6659 float128 float128_rem(float128 a, float128 b, float_status *status)
6661 flag aSign, zSign;
6662 int32 aExp, bExp, expDiff;
6663 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6664 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6665 int64_t sigMean0;
6666 float128 z;
6668 aSig1 = extractFloat128Frac1( a );
6669 aSig0 = extractFloat128Frac0( a );
6670 aExp = extractFloat128Exp( a );
6671 aSign = extractFloat128Sign( a );
6672 bSig1 = extractFloat128Frac1( b );
6673 bSig0 = extractFloat128Frac0( b );
6674 bExp = extractFloat128Exp( b );
6675 if ( aExp == 0x7FFF ) {
6676 if ( ( aSig0 | aSig1 )
6677 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6678 return propagateFloat128NaN(a, b, status);
6680 goto invalid;
6682 if ( bExp == 0x7FFF ) {
6683 if (bSig0 | bSig1) {
6684 return propagateFloat128NaN(a, b, status);
6686 return a;
6688 if ( bExp == 0 ) {
6689 if ( ( bSig0 | bSig1 ) == 0 ) {
6690 invalid:
6691 float_raise(float_flag_invalid, status);
6692 z.low = float128_default_nan_low;
6693 z.high = float128_default_nan_high;
6694 return z;
6696 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6698 if ( aExp == 0 ) {
6699 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6700 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6702 expDiff = aExp - bExp;
6703 if ( expDiff < -1 ) return a;
6704 shortShift128Left(
6705 aSig0 | LIT64( 0x0001000000000000 ),
6706 aSig1,
6707 15 - ( expDiff < 0 ),
6708 &aSig0,
6709 &aSig1
6711 shortShift128Left(
6712 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6713 q = le128( bSig0, bSig1, aSig0, aSig1 );
6714 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6715 expDiff -= 64;
6716 while ( 0 < expDiff ) {
6717 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718 q = ( 4 < q ) ? q - 4 : 0;
6719 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6720 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6721 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6722 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6723 expDiff -= 61;
6725 if ( -64 < expDiff ) {
6726 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6727 q = ( 4 < q ) ? q - 4 : 0;
6728 q >>= - expDiff;
6729 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6730 expDiff += 52;
6731 if ( expDiff < 0 ) {
6732 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6734 else {
6735 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6737 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6738 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6740 else {
6741 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6742 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6744 do {
6745 alternateASig0 = aSig0;
6746 alternateASig1 = aSig1;
6747 ++q;
6748 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6749 } while ( 0 <= (int64_t) aSig0 );
6750 add128(
6751 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6752 if ( ( sigMean0 < 0 )
6753 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6754 aSig0 = alternateASig0;
6755 aSig1 = alternateASig1;
6757 zSign = ( (int64_t) aSig0 < 0 );
6758 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6759 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6760 status);
6763 /*----------------------------------------------------------------------------
6764 | Returns the square root of the quadruple-precision floating-point value `a'.
6765 | The operation is performed according to the IEC/IEEE Standard for Binary
6766 | Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6769 float128 float128_sqrt(float128 a, float_status *status)
6771 flag aSign;
6772 int32 aExp, zExp;
6773 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6774 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6775 float128 z;
6777 aSig1 = extractFloat128Frac1( a );
6778 aSig0 = extractFloat128Frac0( a );
6779 aExp = extractFloat128Exp( a );
6780 aSign = extractFloat128Sign( a );
6781 if ( aExp == 0x7FFF ) {
6782 if (aSig0 | aSig1) {
6783 return propagateFloat128NaN(a, a, status);
6785 if ( ! aSign ) return a;
6786 goto invalid;
6788 if ( aSign ) {
6789 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6790 invalid:
6791 float_raise(float_flag_invalid, status);
6792 z.low = float128_default_nan_low;
6793 z.high = float128_default_nan_high;
6794 return z;
6796 if ( aExp == 0 ) {
6797 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6798 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6800 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6801 aSig0 |= LIT64( 0x0001000000000000 );
6802 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6803 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6804 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6805 doubleZSig0 = zSig0<<1;
6806 mul64To128( zSig0, zSig0, &term0, &term1 );
6807 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6808 while ( (int64_t) rem0 < 0 ) {
6809 --zSig0;
6810 doubleZSig0 -= 2;
6811 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6813 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6814 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6815 if ( zSig1 == 0 ) zSig1 = 1;
6816 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6817 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6818 mul64To128( zSig1, zSig1, &term2, &term3 );
6819 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6820 while ( (int64_t) rem1 < 0 ) {
6821 --zSig1;
6822 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6823 term3 |= 1;
6824 term2 |= doubleZSig0;
6825 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6827 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6829 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6830 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6834 /*----------------------------------------------------------------------------
6835 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6836 | the corresponding value `b', and 0 otherwise. The invalid exception is
6837 | raised if either operand is a NaN. Otherwise, the comparison is performed
6838 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6839 *----------------------------------------------------------------------------*/
6841 int float128_eq(float128 a, float128 b, float_status *status)
6844 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6845 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6847 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6849 float_raise(float_flag_invalid, status);
6850 return 0;
6852 return
6853 ( a.low == b.low )
6854 && ( ( a.high == b.high )
6855 || ( ( a.low == 0 )
6856 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6861 /*----------------------------------------------------------------------------
6862 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6863 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6864 | exception is raised if either operand is a NaN. The comparison is performed
6865 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6866 *----------------------------------------------------------------------------*/
6868 int float128_le(float128 a, float128 b, float_status *status)
6870 flag aSign, bSign;
6872 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6873 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6874 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6875 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6877 float_raise(float_flag_invalid, status);
6878 return 0;
6880 aSign = extractFloat128Sign( a );
6881 bSign = extractFloat128Sign( b );
6882 if ( aSign != bSign ) {
6883 return
6884 aSign
6885 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6886 == 0 );
6888 return
6889 aSign ? le128( b.high, b.low, a.high, a.low )
6890 : le128( a.high, a.low, b.high, b.low );
6894 /*----------------------------------------------------------------------------
6895 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6896 | the corresponding value `b', and 0 otherwise. The invalid exception is
6897 | raised if either operand is a NaN. The comparison is performed according
6898 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6899 *----------------------------------------------------------------------------*/
6901 int float128_lt(float128 a, float128 b, float_status *status)
6903 flag aSign, bSign;
6905 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6906 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6907 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6908 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6910 float_raise(float_flag_invalid, status);
6911 return 0;
6913 aSign = extractFloat128Sign( a );
6914 bSign = extractFloat128Sign( b );
6915 if ( aSign != bSign ) {
6916 return
6917 aSign
6918 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6919 != 0 );
6921 return
6922 aSign ? lt128( b.high, b.low, a.high, a.low )
6923 : lt128( a.high, a.low, b.high, b.low );
6927 /*----------------------------------------------------------------------------
6928 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6929 | be compared, and 0 otherwise. The invalid exception is raised if either
6930 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6931 | Standard for Binary Floating-Point Arithmetic.
6932 *----------------------------------------------------------------------------*/
6934 int float128_unordered(float128 a, float128 b, float_status *status)
6936 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6937 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6939 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6941 float_raise(float_flag_invalid, status);
6942 return 1;
6944 return 0;
6947 /*----------------------------------------------------------------------------
6948 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6949 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6950 | exception. The comparison is performed according to the IEC/IEEE Standard
6951 | for Binary Floating-Point Arithmetic.
6952 *----------------------------------------------------------------------------*/
6954 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6957 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6958 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6959 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6960 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6962 if ( float128_is_signaling_nan( a )
6963 || float128_is_signaling_nan( b ) ) {
6964 float_raise(float_flag_invalid, status);
6966 return 0;
6968 return
6969 ( a.low == b.low )
6970 && ( ( a.high == b.high )
6971 || ( ( a.low == 0 )
6972 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6977 /*----------------------------------------------------------------------------
6978 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6979 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6980 | cause an exception. Otherwise, the comparison is performed according to the
6981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6982 *----------------------------------------------------------------------------*/
6984 int float128_le_quiet(float128 a, float128 b, float_status *status)
6986 flag aSign, bSign;
6988 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6989 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6990 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6991 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6993 if ( float128_is_signaling_nan( a )
6994 || float128_is_signaling_nan( b ) ) {
6995 float_raise(float_flag_invalid, status);
6997 return 0;
6999 aSign = extractFloat128Sign( a );
7000 bSign = extractFloat128Sign( b );
7001 if ( aSign != bSign ) {
7002 return
7003 aSign
7004 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7005 == 0 );
7007 return
7008 aSign ? le128( b.high, b.low, a.high, a.low )
7009 : le128( a.high, a.low, b.high, b.low );
7013 /*----------------------------------------------------------------------------
7014 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7015 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7016 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7017 | Standard for Binary Floating-Point Arithmetic.
7018 *----------------------------------------------------------------------------*/
7020 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7022 flag aSign, bSign;
7024 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7025 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7026 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7027 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7029 if ( float128_is_signaling_nan( a )
7030 || float128_is_signaling_nan( b ) ) {
7031 float_raise(float_flag_invalid, status);
7033 return 0;
7035 aSign = extractFloat128Sign( a );
7036 bSign = extractFloat128Sign( b );
7037 if ( aSign != bSign ) {
7038 return
7039 aSign
7040 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7041 != 0 );
7043 return
7044 aSign ? lt128( b.high, b.low, a.high, a.low )
7045 : lt128( a.high, a.low, b.high, b.low );
7049 /*----------------------------------------------------------------------------
7050 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7051 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7052 | comparison is performed according to the IEC/IEEE Standard for Binary
7053 | Floating-Point Arithmetic.
7054 *----------------------------------------------------------------------------*/
7056 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7058 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7059 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7061 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7063 if ( float128_is_signaling_nan( a )
7064 || float128_is_signaling_nan( b ) ) {
7065 float_raise(float_flag_invalid, status);
7067 return 1;
7069 return 0;
7072 /* misc functions */
7073 float32 uint32_to_float32(uint32_t a, float_status *status)
7075 return int64_to_float32(a, status);
7078 float64 uint32_to_float64(uint32_t a, float_status *status)
7080 return int64_to_float64(a, status);
7083 uint32 float32_to_uint32(float32 a, float_status *status)
7085 int64_t v;
7086 uint32 res;
7087 int old_exc_flags = get_float_exception_flags(status);
7089 v = float32_to_int64(a, status);
7090 if (v < 0) {
7091 res = 0;
7092 } else if (v > 0xffffffff) {
7093 res = 0xffffffff;
7094 } else {
7095 return v;
7097 set_float_exception_flags(old_exc_flags, status);
7098 float_raise(float_flag_invalid, status);
7099 return res;
7102 uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
7104 int64_t v;
7105 uint32 res;
7106 int old_exc_flags = get_float_exception_flags(status);
7108 v = float32_to_int64_round_to_zero(a, status);
7109 if (v < 0) {
7110 res = 0;
7111 } else if (v > 0xffffffff) {
7112 res = 0xffffffff;
7113 } else {
7114 return v;
7116 set_float_exception_flags(old_exc_flags, status);
7117 float_raise(float_flag_invalid, status);
7118 return res;
7121 int_fast16_t float32_to_int16(float32 a, float_status *status)
7123 int32_t v;
7124 int_fast16_t res;
7125 int old_exc_flags = get_float_exception_flags(status);
7127 v = float32_to_int32(a, status);
7128 if (v < -0x8000) {
7129 res = -0x8000;
7130 } else if (v > 0x7fff) {
7131 res = 0x7fff;
7132 } else {
7133 return v;
7136 set_float_exception_flags(old_exc_flags, status);
7137 float_raise(float_flag_invalid, status);
7138 return res;
7141 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7143 int32_t v;
7144 uint_fast16_t res;
7145 int old_exc_flags = get_float_exception_flags(status);
7147 v = float32_to_int32(a, status);
7148 if (v < 0) {
7149 res = 0;
7150 } else if (v > 0xffff) {
7151 res = 0xffff;
7152 } else {
7153 return v;
7156 set_float_exception_flags(old_exc_flags, status);
7157 float_raise(float_flag_invalid, status);
7158 return res;
7161 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7163 int64_t v;
7164 uint_fast16_t res;
7165 int old_exc_flags = get_float_exception_flags(status);
7167 v = float32_to_int64_round_to_zero(a, status);
7168 if (v < 0) {
7169 res = 0;
7170 } else if (v > 0xffff) {
7171 res = 0xffff;
7172 } else {
7173 return v;
7175 set_float_exception_flags(old_exc_flags, status);
7176 float_raise(float_flag_invalid, status);
7177 return res;
7180 uint32 float64_to_uint32(float64 a, float_status *status)
7182 uint64_t v;
7183 uint32 res;
7184 int old_exc_flags = get_float_exception_flags(status);
7186 v = float64_to_uint64(a, status);
7187 if (v > 0xffffffff) {
7188 res = 0xffffffff;
7189 } else {
7190 return v;
7192 set_float_exception_flags(old_exc_flags, status);
7193 float_raise(float_flag_invalid, status);
7194 return res;
7197 uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
7199 uint64_t v;
7200 uint32 res;
7201 int old_exc_flags = get_float_exception_flags(status);
7203 v = float64_to_uint64_round_to_zero(a, status);
7204 if (v > 0xffffffff) {
7205 res = 0xffffffff;
7206 } else {
7207 return v;
7209 set_float_exception_flags(old_exc_flags, status);
7210 float_raise(float_flag_invalid, status);
7211 return res;
7214 int_fast16_t float64_to_int16(float64 a, float_status *status)
7216 int64_t v;
7217 int_fast16_t res;
7218 int old_exc_flags = get_float_exception_flags(status);
7220 v = float64_to_int32(a, status);
7221 if (v < -0x8000) {
7222 res = -0x8000;
7223 } else if (v > 0x7fff) {
7224 res = 0x7fff;
7225 } else {
7226 return v;
7229 set_float_exception_flags(old_exc_flags, status);
7230 float_raise(float_flag_invalid, status);
7231 return res;
7234 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7236 int64_t v;
7237 uint_fast16_t res;
7238 int old_exc_flags = get_float_exception_flags(status);
7240 v = float64_to_int32(a, status);
7241 if (v < 0) {
7242 res = 0;
7243 } else if (v > 0xffff) {
7244 res = 0xffff;
7245 } else {
7246 return v;
7249 set_float_exception_flags(old_exc_flags, status);
7250 float_raise(float_flag_invalid, status);
7251 return res;
7254 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7256 int64_t v;
7257 uint_fast16_t res;
7258 int old_exc_flags = get_float_exception_flags(status);
7260 v = float64_to_int64_round_to_zero(a, status);
7261 if (v < 0) {
7262 res = 0;
7263 } else if (v > 0xffff) {
7264 res = 0xffff;
7265 } else {
7266 return v;
7268 set_float_exception_flags(old_exc_flags, status);
7269 float_raise(float_flag_invalid, status);
7270 return res;
7273 /*----------------------------------------------------------------------------
7274 | Returns the result of converting the double-precision floating-point value
7275 | `a' to the 64-bit unsigned integer format. The conversion is
7276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7277 | Arithmetic---which means in particular that the conversion is rounded
7278 | according to the current rounding mode. If `a' is a NaN, the largest
7279 | positive integer is returned. If the conversion overflows, the
7280 | largest unsigned integer is returned. If 'a' is negative, the value is
7281 | rounded and zero is returned; negative values that do not round to zero
7282 | will raise the inexact exception.
7283 *----------------------------------------------------------------------------*/
7285 uint64_t float64_to_uint64(float64 a, float_status *status)
7287 flag aSign;
7288 int_fast16_t aExp, shiftCount;
7289 uint64_t aSig, aSigExtra;
7290 a = float64_squash_input_denormal(a, status);
7292 aSig = extractFloat64Frac(a);
7293 aExp = extractFloat64Exp(a);
7294 aSign = extractFloat64Sign(a);
7295 if (aSign && (aExp > 1022)) {
7296 float_raise(float_flag_invalid, status);
7297 if (float64_is_any_nan(a)) {
7298 return LIT64(0xFFFFFFFFFFFFFFFF);
7299 } else {
7300 return 0;
7303 if (aExp) {
7304 aSig |= LIT64(0x0010000000000000);
7306 shiftCount = 0x433 - aExp;
7307 if (shiftCount <= 0) {
7308 if (0x43E < aExp) {
7309 float_raise(float_flag_invalid, status);
7310 return LIT64(0xFFFFFFFFFFFFFFFF);
7312 aSigExtra = 0;
7313 aSig <<= -shiftCount;
7314 } else {
7315 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7317 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7320 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7322 signed char current_rounding_mode = status->float_rounding_mode;
7323 set_float_rounding_mode(float_round_to_zero, status);
7324 int64_t v = float64_to_uint64(a, status);
7325 set_float_rounding_mode(current_rounding_mode, status);
7326 return v;
7329 #define COMPARE(s, nan_exp) \
7330 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7331 int is_quiet, float_status *status) \
7333 flag aSign, bSign; \
7334 uint ## s ## _t av, bv; \
7335 a = float ## s ## _squash_input_denormal(a, status); \
7336 b = float ## s ## _squash_input_denormal(b, status); \
7338 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7339 extractFloat ## s ## Frac( a ) ) || \
7340 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7341 extractFloat ## s ## Frac( b ) )) { \
7342 if (!is_quiet || \
7343 float ## s ## _is_signaling_nan( a ) || \
7344 float ## s ## _is_signaling_nan( b ) ) { \
7345 float_raise(float_flag_invalid, status); \
7347 return float_relation_unordered; \
7349 aSign = extractFloat ## s ## Sign( a ); \
7350 bSign = extractFloat ## s ## Sign( b ); \
7351 av = float ## s ## _val(a); \
7352 bv = float ## s ## _val(b); \
7353 if ( aSign != bSign ) { \
7354 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7355 /* zero case */ \
7356 return float_relation_equal; \
7357 } else { \
7358 return 1 - (2 * aSign); \
7360 } else { \
7361 if (av == bv) { \
7362 return float_relation_equal; \
7363 } else { \
7364 return 1 - 2 * (aSign ^ ( av < bv )); \
7369 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7371 return float ## s ## _compare_internal(a, b, 0, status); \
7374 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7375 float_status *status) \
7377 return float ## s ## _compare_internal(a, b, 1, status); \
7380 COMPARE(32, 0xff)
7381 COMPARE(64, 0x7ff)
7383 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7384 int is_quiet, float_status *status)
7386 flag aSign, bSign;
7388 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7389 ( extractFloatx80Frac( a )<<1 ) ) ||
7390 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7391 ( extractFloatx80Frac( b )<<1 ) )) {
7392 if (!is_quiet ||
7393 floatx80_is_signaling_nan( a ) ||
7394 floatx80_is_signaling_nan( b ) ) {
7395 float_raise(float_flag_invalid, status);
7397 return float_relation_unordered;
7399 aSign = extractFloatx80Sign( a );
7400 bSign = extractFloatx80Sign( b );
7401 if ( aSign != bSign ) {
7403 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7404 ( ( a.low | b.low ) == 0 ) ) {
7405 /* zero case */
7406 return float_relation_equal;
7407 } else {
7408 return 1 - (2 * aSign);
7410 } else {
7411 if (a.low == b.low && a.high == b.high) {
7412 return float_relation_equal;
7413 } else {
7414 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7419 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7421 return floatx80_compare_internal(a, b, 0, status);
7424 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7426 return floatx80_compare_internal(a, b, 1, status);
7429 static inline int float128_compare_internal(float128 a, float128 b,
7430 int is_quiet, float_status *status)
7432 flag aSign, bSign;
7434 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7435 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7436 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7437 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7438 if (!is_quiet ||
7439 float128_is_signaling_nan( a ) ||
7440 float128_is_signaling_nan( b ) ) {
7441 float_raise(float_flag_invalid, status);
7443 return float_relation_unordered;
7445 aSign = extractFloat128Sign( a );
7446 bSign = extractFloat128Sign( b );
7447 if ( aSign != bSign ) {
7448 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7449 /* zero case */
7450 return float_relation_equal;
7451 } else {
7452 return 1 - (2 * aSign);
7454 } else {
7455 if (a.low == b.low && a.high == b.high) {
7456 return float_relation_equal;
7457 } else {
7458 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7463 int float128_compare(float128 a, float128 b, float_status *status)
7465 return float128_compare_internal(a, b, 0, status);
7468 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7470 return float128_compare_internal(a, b, 1, status);
7473 /* min() and max() functions. These can't be implemented as
7474 * 'compare and pick one input' because that would mishandle
7475 * NaNs and +0 vs -0.
7477 * minnum() and maxnum() functions. These are similar to the min()
7478 * and max() functions but if one of the arguments is a QNaN and
7479 * the other is numerical then the numerical argument is returned.
7480 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7481 * and maxNum() operations. min() and max() are the typical min/max
7482 * semantics provided by many CPUs which predate that specification.
7484 * minnummag() and maxnummag() functions correspond to minNumMag()
7485 * and minNumMag() from the IEEE-754 2008.
7487 #define MINMAX(s) \
7488 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7489 int ismin, int isieee, \
7490 int ismag, \
7491 float_status *status) \
7493 flag aSign, bSign; \
7494 uint ## s ## _t av, bv, aav, abv; \
7495 a = float ## s ## _squash_input_denormal(a, status); \
7496 b = float ## s ## _squash_input_denormal(b, status); \
7497 if (float ## s ## _is_any_nan(a) || \
7498 float ## s ## _is_any_nan(b)) { \
7499 if (isieee) { \
7500 if (float ## s ## _is_quiet_nan(a) && \
7501 !float ## s ##_is_any_nan(b)) { \
7502 return b; \
7503 } else if (float ## s ## _is_quiet_nan(b) && \
7504 !float ## s ## _is_any_nan(a)) { \
7505 return a; \
7508 return propagateFloat ## s ## NaN(a, b, status); \
7510 aSign = extractFloat ## s ## Sign(a); \
7511 bSign = extractFloat ## s ## Sign(b); \
7512 av = float ## s ## _val(a); \
7513 bv = float ## s ## _val(b); \
7514 if (ismag) { \
7515 aav = float ## s ## _abs(av); \
7516 abv = float ## s ## _abs(bv); \
7517 if (aav != abv) { \
7518 if (ismin) { \
7519 return (aav < abv) ? a : b; \
7520 } else { \
7521 return (aav < abv) ? b : a; \
7525 if (aSign != bSign) { \
7526 if (ismin) { \
7527 return aSign ? a : b; \
7528 } else { \
7529 return aSign ? b : a; \
7531 } else { \
7532 if (ismin) { \
7533 return (aSign ^ (av < bv)) ? a : b; \
7534 } else { \
7535 return (aSign ^ (av < bv)) ? b : a; \
7540 float ## s float ## s ## _min(float ## s a, float ## s b, \
7541 float_status *status) \
7543 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7546 float ## s float ## s ## _max(float ## s a, float ## s b, \
7547 float_status *status) \
7549 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7552 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7553 float_status *status) \
7555 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7558 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7559 float_status *status) \
7561 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7564 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7565 float_status *status) \
7567 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7570 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7571 float_status *status) \
7573 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7576 MINMAX(32)
7577 MINMAX(64)
7580 /* Multiply A by 2 raised to the power N. */
7581 float32 float32_scalbn(float32 a, int n, float_status *status)
7583 flag aSign;
7584 int16_t aExp;
7585 uint32_t aSig;
7587 a = float32_squash_input_denormal(a, status);
7588 aSig = extractFloat32Frac( a );
7589 aExp = extractFloat32Exp( a );
7590 aSign = extractFloat32Sign( a );
7592 if ( aExp == 0xFF ) {
7593 if ( aSig ) {
7594 return propagateFloat32NaN(a, a, status);
7596 return a;
7598 if (aExp != 0) {
7599 aSig |= 0x00800000;
7600 } else if (aSig == 0) {
7601 return a;
7602 } else {
7603 aExp++;
7606 if (n > 0x200) {
7607 n = 0x200;
7608 } else if (n < -0x200) {
7609 n = -0x200;
7612 aExp += n - 1;
7613 aSig <<= 7;
7614 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7617 float64 float64_scalbn(float64 a, int n, float_status *status)
7619 flag aSign;
7620 int16_t aExp;
7621 uint64_t aSig;
7623 a = float64_squash_input_denormal(a, status);
7624 aSig = extractFloat64Frac( a );
7625 aExp = extractFloat64Exp( a );
7626 aSign = extractFloat64Sign( a );
7628 if ( aExp == 0x7FF ) {
7629 if ( aSig ) {
7630 return propagateFloat64NaN(a, a, status);
7632 return a;
7634 if (aExp != 0) {
7635 aSig |= LIT64( 0x0010000000000000 );
7636 } else if (aSig == 0) {
7637 return a;
7638 } else {
7639 aExp++;
7642 if (n > 0x1000) {
7643 n = 0x1000;
7644 } else if (n < -0x1000) {
7645 n = -0x1000;
7648 aExp += n - 1;
7649 aSig <<= 10;
7650 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7653 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7655 flag aSign;
7656 int32_t aExp;
7657 uint64_t aSig;
7659 aSig = extractFloatx80Frac( a );
7660 aExp = extractFloatx80Exp( a );
7661 aSign = extractFloatx80Sign( a );
7663 if ( aExp == 0x7FFF ) {
7664 if ( aSig<<1 ) {
7665 return propagateFloatx80NaN(a, a, status);
7667 return a;
7670 if (aExp == 0) {
7671 if (aSig == 0) {
7672 return a;
7674 aExp++;
7677 if (n > 0x10000) {
7678 n = 0x10000;
7679 } else if (n < -0x10000) {
7680 n = -0x10000;
7683 aExp += n;
7684 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7685 aSign, aExp, aSig, 0, status);
7688 float128 float128_scalbn(float128 a, int n, float_status *status)
7690 flag aSign;
7691 int32_t aExp;
7692 uint64_t aSig0, aSig1;
7694 aSig1 = extractFloat128Frac1( a );
7695 aSig0 = extractFloat128Frac0( a );
7696 aExp = extractFloat128Exp( a );
7697 aSign = extractFloat128Sign( a );
7698 if ( aExp == 0x7FFF ) {
7699 if ( aSig0 | aSig1 ) {
7700 return propagateFloat128NaN(a, a, status);
7702 return a;
7704 if (aExp != 0) {
7705 aSig0 |= LIT64( 0x0001000000000000 );
7706 } else if (aSig0 == 0 && aSig1 == 0) {
7707 return a;
7708 } else {
7709 aExp++;
7712 if (n > 0x10000) {
7713 n = 0x10000;
7714 } else if (n < -0x10000) {
7715 n = -0x10000;
7718 aExp += n - 1;
7719 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7720 , status);