s390x/kvm: Fix diag-308 register decoding
[qemu/qmp-unstable.git] / fpu / softfloat.c
bloba1f1cb33b4e6d5278c766c95bc24efbcd2ad3455
1 /*
2 * QEMU float support
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
85 #include "config.h"
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
90 #include <stdlib.h>
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "softfloat-macros.h"
99 /*----------------------------------------------------------------------------
100 | Functions and definitions to determine: (1) whether tininess for underflow
101 | is detected before or after rounding by default, (2) what (if anything)
102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 | are propagated from function inputs to output. These details are target-
105 | specific.
106 *----------------------------------------------------------------------------*/
107 #include "softfloat-specialize.h"
109 /*----------------------------------------------------------------------------
110 | Returns the fraction bits of the half-precision floating-point value `a'.
111 *----------------------------------------------------------------------------*/
113 static inline uint32_t extractFloat16Frac(float16 a)
115 return float16_val(a) & 0x3ff;
118 /*----------------------------------------------------------------------------
119 | Returns the exponent bits of the half-precision floating-point value `a'.
120 *----------------------------------------------------------------------------*/
122 static inline int_fast16_t extractFloat16Exp(float16 a)
124 return (float16_val(a) >> 10) & 0x1f;
127 /*----------------------------------------------------------------------------
128 | Returns the sign bit of the single-precision floating-point value `a'.
129 *----------------------------------------------------------------------------*/
131 static inline flag extractFloat16Sign(float16 a)
133 return float16_val(a)>>15;
136 /*----------------------------------------------------------------------------
137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
139 | input. If `zSign' is 1, the input is negated before being converted to an
140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141 | is simply rounded to an integer, with the inexact exception raised if the
142 | input cannot be represented exactly as an integer. However, if the fixed-
143 | point input is too large, the invalid exception is raised and the largest
144 | positive or negative integer is returned.
145 *----------------------------------------------------------------------------*/
147 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
149 int8 roundingMode;
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
152 int32_t z;
154 roundingMode = STATUS(float_rounding_mode);
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
156 switch (roundingMode) {
157 case float_round_nearest_even:
158 case float_round_ties_away:
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
179 float_raise( float_flag_invalid STATUS_VAR);
180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
182 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
183 return z;
187 /*----------------------------------------------------------------------------
188 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
189 | `absZ1', with binary point between bits 63 and 64 (between the input words),
190 | and returns the properly rounded 64-bit integer corresponding to the input.
191 | If `zSign' is 1, the input is negated before being converted to an integer.
192 | Ordinarily, the fixed-point input is simply rounded to an integer, with
193 | the inexact exception raised if the input cannot be represented exactly as
194 | an integer. However, if the fixed-point input is too large, the invalid
195 | exception is raised and the largest positive or negative integer is
196 | returned.
197 *----------------------------------------------------------------------------*/
199 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
201 int8 roundingMode;
202 flag roundNearestEven, increment;
203 int64_t z;
205 roundingMode = STATUS(float_rounding_mode);
206 roundNearestEven = ( roundingMode == float_round_nearest_even );
207 switch (roundingMode) {
208 case float_round_nearest_even:
209 case float_round_ties_away:
210 increment = ((int64_t) absZ1 < 0);
211 break;
212 case float_round_to_zero:
213 increment = 0;
214 break;
215 case float_round_up:
216 increment = !zSign && absZ1;
217 break;
218 case float_round_down:
219 increment = zSign && absZ1;
220 break;
221 default:
222 abort();
224 if ( increment ) {
225 ++absZ0;
226 if ( absZ0 == 0 ) goto overflow;
227 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
229 z = absZ0;
230 if ( zSign ) z = - z;
231 if ( z && ( ( z < 0 ) ^ zSign ) ) {
232 overflow:
233 float_raise( float_flag_invalid STATUS_VAR);
234 return
235 zSign ? (int64_t) LIT64( 0x8000000000000000 )
236 : LIT64( 0x7FFFFFFFFFFFFFFF );
238 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
239 return z;
243 /*----------------------------------------------------------------------------
244 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
245 | `absZ1', with binary point between bits 63 and 64 (between the input words),
246 | and returns the properly rounded 64-bit unsigned integer corresponding to the
247 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
248 | with the inexact exception raised if the input cannot be represented exactly
249 | as an integer. However, if the fixed-point input is too large, the invalid
250 | exception is raised and the largest unsigned integer is returned.
251 *----------------------------------------------------------------------------*/
253 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
254 uint64_t absZ1 STATUS_PARAM)
256 int8 roundingMode;
257 flag roundNearestEven, increment;
259 roundingMode = STATUS(float_rounding_mode);
260 roundNearestEven = (roundingMode == float_round_nearest_even);
261 switch (roundingMode) {
262 case float_round_nearest_even:
263 case float_round_ties_away:
264 increment = ((int64_t)absZ1 < 0);
265 break;
266 case float_round_to_zero:
267 increment = 0;
268 break;
269 case float_round_up:
270 increment = !zSign && absZ1;
271 break;
272 case float_round_down:
273 increment = zSign && absZ1;
274 break;
275 default:
276 abort();
278 if (increment) {
279 ++absZ0;
280 if (absZ0 == 0) {
281 float_raise(float_flag_invalid STATUS_VAR);
282 return LIT64(0xFFFFFFFFFFFFFFFF);
284 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
287 if (zSign && absZ0) {
288 float_raise(float_flag_invalid STATUS_VAR);
289 return 0;
292 if (absZ1) {
293 STATUS(float_exception_flags) |= float_flag_inexact;
295 return absZ0;
298 /*----------------------------------------------------------------------------
299 | Returns the fraction bits of the single-precision floating-point value `a'.
300 *----------------------------------------------------------------------------*/
302 static inline uint32_t extractFloat32Frac( float32 a )
305 return float32_val(a) & 0x007FFFFF;
309 /*----------------------------------------------------------------------------
310 | Returns the exponent bits of the single-precision floating-point value `a'.
311 *----------------------------------------------------------------------------*/
313 static inline int_fast16_t extractFloat32Exp(float32 a)
316 return ( float32_val(a)>>23 ) & 0xFF;
320 /*----------------------------------------------------------------------------
321 | Returns the sign bit of the single-precision floating-point value `a'.
322 *----------------------------------------------------------------------------*/
324 static inline flag extractFloat32Sign( float32 a )
327 return float32_val(a)>>31;
331 /*----------------------------------------------------------------------------
332 | If `a' is denormal and we are in flush-to-zero mode then set the
333 | input-denormal exception and return zero. Otherwise just return the value.
334 *----------------------------------------------------------------------------*/
335 float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
337 if (STATUS(flush_inputs_to_zero)) {
338 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
339 float_raise(float_flag_input_denormal STATUS_VAR);
340 return make_float32(float32_val(a) & 0x80000000);
343 return a;
346 /*----------------------------------------------------------------------------
347 | Normalizes the subnormal single-precision floating-point value represented
348 | by the denormalized significand `aSig'. The normalized exponent and
349 | significand are stored at the locations pointed to by `zExpPtr' and
350 | `zSigPtr', respectively.
351 *----------------------------------------------------------------------------*/
353 static void
354 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
356 int8 shiftCount;
358 shiftCount = countLeadingZeros32( aSig ) - 8;
359 *zSigPtr = aSig<<shiftCount;
360 *zExpPtr = 1 - shiftCount;
364 /*----------------------------------------------------------------------------
365 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
366 | single-precision floating-point value, returning the result. After being
367 | shifted into the proper positions, the three fields are simply added
368 | together to form the result. This means that any integer portion of `zSig'
369 | will be added into the exponent. Since a properly normalized significand
370 | will have an integer portion equal to 1, the `zExp' input should be 1 less
371 | than the desired result exponent whenever `zSig' is a complete, normalized
372 | significand.
373 *----------------------------------------------------------------------------*/
375 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
378 return make_float32(
379 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
383 /*----------------------------------------------------------------------------
384 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385 | and significand `zSig', and returns the proper single-precision floating-
386 | point value corresponding to the abstract input. Ordinarily, the abstract
387 | value is simply rounded and packed into the single-precision format, with
388 | the inexact exception raised if the abstract input cannot be represented
389 | exactly. However, if the abstract value is too large, the overflow and
390 | inexact exceptions are raised and an infinity or maximal finite value is
391 | returned. If the abstract value is too small, the input value is rounded to
392 | a subnormal number, and the underflow and inexact exceptions are raised if
393 | the abstract input cannot be represented exactly as a subnormal single-
394 | precision floating-point number.
395 | The input significand `zSig' has its binary point between bits 30
396 | and 29, which is 7 bits to the left of the usual location. This shifted
397 | significand must be normalized or smaller. If `zSig' is not normalized,
398 | `zExp' must be 0; in that case, the result returned is a subnormal number,
399 | and it must not require rounding. In the usual case that `zSig' is
400 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
401 | The handling of underflow and overflow follows the IEC/IEEE Standard for
402 | Binary Floating-Point Arithmetic.
403 *----------------------------------------------------------------------------*/
405 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
407 int8 roundingMode;
408 flag roundNearestEven;
409 int8 roundIncrement, roundBits;
410 flag isTiny;
412 roundingMode = STATUS(float_rounding_mode);
413 roundNearestEven = ( roundingMode == float_round_nearest_even );
414 switch (roundingMode) {
415 case float_round_nearest_even:
416 case float_round_ties_away:
417 roundIncrement = 0x40;
418 break;
419 case float_round_to_zero:
420 roundIncrement = 0;
421 break;
422 case float_round_up:
423 roundIncrement = zSign ? 0 : 0x7f;
424 break;
425 case float_round_down:
426 roundIncrement = zSign ? 0x7f : 0;
427 break;
428 default:
429 abort();
430 break;
432 roundBits = zSig & 0x7F;
433 if ( 0xFD <= (uint16_t) zExp ) {
434 if ( ( 0xFD < zExp )
435 || ( ( zExp == 0xFD )
436 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
438 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
439 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
441 if ( zExp < 0 ) {
442 if (STATUS(flush_to_zero)) {
443 float_raise(float_flag_output_denormal STATUS_VAR);
444 return packFloat32(zSign, 0, 0);
446 isTiny =
447 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
448 || ( zExp < -1 )
449 || ( zSig + roundIncrement < 0x80000000 );
450 shift32RightJamming( zSig, - zExp, &zSig );
451 zExp = 0;
452 roundBits = zSig & 0x7F;
453 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
456 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
457 zSig = ( zSig + roundIncrement )>>7;
458 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
459 if ( zSig == 0 ) zExp = 0;
460 return packFloat32( zSign, zExp, zSig );
464 /*----------------------------------------------------------------------------
465 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
466 | and significand `zSig', and returns the proper single-precision floating-
467 | point value corresponding to the abstract input. This routine is just like
468 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
469 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
470 | floating-point exponent.
471 *----------------------------------------------------------------------------*/
473 static float32
474 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
476 int8 shiftCount;
478 shiftCount = countLeadingZeros32( zSig ) - 1;
479 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
483 /*----------------------------------------------------------------------------
484 | Returns the fraction bits of the double-precision floating-point value `a'.
485 *----------------------------------------------------------------------------*/
487 static inline uint64_t extractFloat64Frac( float64 a )
490 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
494 /*----------------------------------------------------------------------------
495 | Returns the exponent bits of the double-precision floating-point value `a'.
496 *----------------------------------------------------------------------------*/
498 static inline int_fast16_t extractFloat64Exp(float64 a)
501 return ( float64_val(a)>>52 ) & 0x7FF;
505 /*----------------------------------------------------------------------------
506 | Returns the sign bit of the double-precision floating-point value `a'.
507 *----------------------------------------------------------------------------*/
509 static inline flag extractFloat64Sign( float64 a )
512 return float64_val(a)>>63;
516 /*----------------------------------------------------------------------------
517 | If `a' is denormal and we are in flush-to-zero mode then set the
518 | input-denormal exception and return zero. Otherwise just return the value.
519 *----------------------------------------------------------------------------*/
520 float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
522 if (STATUS(flush_inputs_to_zero)) {
523 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
524 float_raise(float_flag_input_denormal STATUS_VAR);
525 return make_float64(float64_val(a) & (1ULL << 63));
528 return a;
531 /*----------------------------------------------------------------------------
532 | Normalizes the subnormal double-precision floating-point value represented
533 | by the denormalized significand `aSig'. The normalized exponent and
534 | significand are stored at the locations pointed to by `zExpPtr' and
535 | `zSigPtr', respectively.
536 *----------------------------------------------------------------------------*/
538 static void
539 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
541 int8 shiftCount;
543 shiftCount = countLeadingZeros64( aSig ) - 11;
544 *zSigPtr = aSig<<shiftCount;
545 *zExpPtr = 1 - shiftCount;
549 /*----------------------------------------------------------------------------
550 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
551 | double-precision floating-point value, returning the result. After being
552 | shifted into the proper positions, the three fields are simply added
553 | together to form the result. This means that any integer portion of `zSig'
554 | will be added into the exponent. Since a properly normalized significand
555 | will have an integer portion equal to 1, the `zExp' input should be 1 less
556 | than the desired result exponent whenever `zSig' is a complete, normalized
557 | significand.
558 *----------------------------------------------------------------------------*/
560 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
563 return make_float64(
564 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
568 /*----------------------------------------------------------------------------
569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570 | and significand `zSig', and returns the proper double-precision floating-
571 | point value corresponding to the abstract input. Ordinarily, the abstract
572 | value is simply rounded and packed into the double-precision format, with
573 | the inexact exception raised if the abstract input cannot be represented
574 | exactly. However, if the abstract value is too large, the overflow and
575 | inexact exceptions are raised and an infinity or maximal finite value is
576 | returned. If the abstract value is too small, the input value is rounded to
577 | a subnormal number, and the underflow and inexact exceptions are raised if
578 | the abstract input cannot be represented exactly as a subnormal double-
579 | precision floating-point number.
580 | The input significand `zSig' has its binary point between bits 62
581 | and 61, which is 10 bits to the left of the usual location. This shifted
582 | significand must be normalized or smaller. If `zSig' is not normalized,
583 | `zExp' must be 0; in that case, the result returned is a subnormal number,
584 | and it must not require rounding. In the usual case that `zSig' is
585 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
586 | The handling of underflow and overflow follows the IEC/IEEE Standard for
587 | Binary Floating-Point Arithmetic.
588 *----------------------------------------------------------------------------*/
590 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
592 int8 roundingMode;
593 flag roundNearestEven;
594 int_fast16_t roundIncrement, roundBits;
595 flag isTiny;
597 roundingMode = STATUS(float_rounding_mode);
598 roundNearestEven = ( roundingMode == float_round_nearest_even );
599 switch (roundingMode) {
600 case float_round_nearest_even:
601 case float_round_ties_away:
602 roundIncrement = 0x200;
603 break;
604 case float_round_to_zero:
605 roundIncrement = 0;
606 break;
607 case float_round_up:
608 roundIncrement = zSign ? 0 : 0x3ff;
609 break;
610 case float_round_down:
611 roundIncrement = zSign ? 0x3ff : 0;
612 break;
613 default:
614 abort();
616 roundBits = zSig & 0x3FF;
617 if ( 0x7FD <= (uint16_t) zExp ) {
618 if ( ( 0x7FD < zExp )
619 || ( ( zExp == 0x7FD )
620 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
622 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
623 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
625 if ( zExp < 0 ) {
626 if (STATUS(flush_to_zero)) {
627 float_raise(float_flag_output_denormal STATUS_VAR);
628 return packFloat64(zSign, 0, 0);
630 isTiny =
631 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
632 || ( zExp < -1 )
633 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
634 shift64RightJamming( zSig, - zExp, &zSig );
635 zExp = 0;
636 roundBits = zSig & 0x3FF;
637 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
640 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
641 zSig = ( zSig + roundIncrement )>>10;
642 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
643 if ( zSig == 0 ) zExp = 0;
644 return packFloat64( zSign, zExp, zSig );
648 /*----------------------------------------------------------------------------
649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
650 | and significand `zSig', and returns the proper double-precision floating-
651 | point value corresponding to the abstract input. This routine is just like
652 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
653 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
654 | floating-point exponent.
655 *----------------------------------------------------------------------------*/
657 static float64
658 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
660 int8 shiftCount;
662 shiftCount = countLeadingZeros64( zSig ) - 1;
663 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
667 /*----------------------------------------------------------------------------
668 | Returns the fraction bits of the extended double-precision floating-point
669 | value `a'.
670 *----------------------------------------------------------------------------*/
672 static inline uint64_t extractFloatx80Frac( floatx80 a )
675 return a.low;
679 /*----------------------------------------------------------------------------
680 | Returns the exponent bits of the extended double-precision floating-point
681 | value `a'.
682 *----------------------------------------------------------------------------*/
684 static inline int32 extractFloatx80Exp( floatx80 a )
687 return a.high & 0x7FFF;
691 /*----------------------------------------------------------------------------
692 | Returns the sign bit of the extended double-precision floating-point value
693 | `a'.
694 *----------------------------------------------------------------------------*/
696 static inline flag extractFloatx80Sign( floatx80 a )
699 return a.high>>15;
703 /*----------------------------------------------------------------------------
704 | Normalizes the subnormal extended double-precision floating-point value
705 | represented by the denormalized significand `aSig'. The normalized exponent
706 | and significand are stored at the locations pointed to by `zExpPtr' and
707 | `zSigPtr', respectively.
708 *----------------------------------------------------------------------------*/
710 static void
711 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
713 int8 shiftCount;
715 shiftCount = countLeadingZeros64( aSig );
716 *zSigPtr = aSig<<shiftCount;
717 *zExpPtr = 1 - shiftCount;
721 /*----------------------------------------------------------------------------
722 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
723 | extended double-precision floating-point value, returning the result.
724 *----------------------------------------------------------------------------*/
726 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
728 floatx80 z;
730 z.low = zSig;
731 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
732 return z;
736 /*----------------------------------------------------------------------------
737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
738 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
739 | and returns the proper extended double-precision floating-point value
740 | corresponding to the abstract input. Ordinarily, the abstract value is
741 | rounded and packed into the extended double-precision format, with the
742 | inexact exception raised if the abstract input cannot be represented
743 | exactly. However, if the abstract value is too large, the overflow and
744 | inexact exceptions are raised and an infinity or maximal finite value is
745 | returned. If the abstract value is too small, the input value is rounded to
746 | a subnormal number, and the underflow and inexact exceptions are raised if
747 | the abstract input cannot be represented exactly as a subnormal extended
748 | double-precision floating-point number.
749 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
750 | number of bits as single or double precision, respectively. Otherwise, the
751 | result is rounded to the full precision of the extended double-precision
752 | format.
753 | The input significand must be normalized or smaller. If the input
754 | significand is not normalized, `zExp' must be 0; in that case, the result
755 | returned is a subnormal number, and it must not require rounding. The
756 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
757 | Floating-Point Arithmetic.
758 *----------------------------------------------------------------------------*/
760 static floatx80
761 roundAndPackFloatx80(
762 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
763 STATUS_PARAM)
765 int8 roundingMode;
766 flag roundNearestEven, increment, isTiny;
767 int64 roundIncrement, roundMask, roundBits;
769 roundingMode = STATUS(float_rounding_mode);
770 roundNearestEven = ( roundingMode == float_round_nearest_even );
771 if ( roundingPrecision == 80 ) goto precision80;
772 if ( roundingPrecision == 64 ) {
773 roundIncrement = LIT64( 0x0000000000000400 );
774 roundMask = LIT64( 0x00000000000007FF );
776 else if ( roundingPrecision == 32 ) {
777 roundIncrement = LIT64( 0x0000008000000000 );
778 roundMask = LIT64( 0x000000FFFFFFFFFF );
780 else {
781 goto precision80;
783 zSig0 |= ( zSig1 != 0 );
784 switch (roundingMode) {
785 case float_round_nearest_even:
786 case float_round_ties_away:
787 break;
788 case float_round_to_zero:
789 roundIncrement = 0;
790 break;
791 case float_round_up:
792 roundIncrement = zSign ? 0 : roundMask;
793 break;
794 case float_round_down:
795 roundIncrement = zSign ? roundMask : 0;
796 break;
797 default:
798 abort();
800 roundBits = zSig0 & roundMask;
801 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
802 if ( ( 0x7FFE < zExp )
803 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
805 goto overflow;
807 if ( zExp <= 0 ) {
808 if (STATUS(flush_to_zero)) {
809 float_raise(float_flag_output_denormal STATUS_VAR);
810 return packFloatx80(zSign, 0, 0);
812 isTiny =
813 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
814 || ( zExp < 0 )
815 || ( zSig0 <= zSig0 + roundIncrement );
816 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
817 zExp = 0;
818 roundBits = zSig0 & roundMask;
819 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
820 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
821 zSig0 += roundIncrement;
822 if ( (int64_t) zSig0 < 0 ) zExp = 1;
823 roundIncrement = roundMask + 1;
824 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
825 roundMask |= roundIncrement;
827 zSig0 &= ~ roundMask;
828 return packFloatx80( zSign, zExp, zSig0 );
831 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
832 zSig0 += roundIncrement;
833 if ( zSig0 < roundIncrement ) {
834 ++zExp;
835 zSig0 = LIT64( 0x8000000000000000 );
837 roundIncrement = roundMask + 1;
838 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
839 roundMask |= roundIncrement;
841 zSig0 &= ~ roundMask;
842 if ( zSig0 == 0 ) zExp = 0;
843 return packFloatx80( zSign, zExp, zSig0 );
844 precision80:
845 switch (roundingMode) {
846 case float_round_nearest_even:
847 case float_round_ties_away:
848 increment = ((int64_t)zSig1 < 0);
849 break;
850 case float_round_to_zero:
851 increment = 0;
852 break;
853 case float_round_up:
854 increment = !zSign && zSig1;
855 break;
856 case float_round_down:
857 increment = zSign && zSig1;
858 break;
859 default:
860 abort();
862 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
863 if ( ( 0x7FFE < zExp )
864 || ( ( zExp == 0x7FFE )
865 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
866 && increment
869 roundMask = 0;
870 overflow:
871 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
872 if ( ( roundingMode == float_round_to_zero )
873 || ( zSign && ( roundingMode == float_round_up ) )
874 || ( ! zSign && ( roundingMode == float_round_down ) )
876 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
878 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
880 if ( zExp <= 0 ) {
881 isTiny =
882 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
883 || ( zExp < 0 )
884 || ! increment
885 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
886 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
887 zExp = 0;
888 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
889 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
890 switch (roundingMode) {
891 case float_round_nearest_even:
892 case float_round_ties_away:
893 increment = ((int64_t)zSig1 < 0);
894 break;
895 case float_round_to_zero:
896 increment = 0;
897 break;
898 case float_round_up:
899 increment = !zSign && zSig1;
900 break;
901 case float_round_down:
902 increment = zSign && zSig1;
903 break;
904 default:
905 abort();
907 if ( increment ) {
908 ++zSig0;
909 zSig0 &=
910 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
911 if ( (int64_t) zSig0 < 0 ) zExp = 1;
913 return packFloatx80( zSign, zExp, zSig0 );
916 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
917 if ( increment ) {
918 ++zSig0;
919 if ( zSig0 == 0 ) {
920 ++zExp;
921 zSig0 = LIT64( 0x8000000000000000 );
923 else {
924 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
927 else {
928 if ( zSig0 == 0 ) zExp = 0;
930 return packFloatx80( zSign, zExp, zSig0 );
934 /*----------------------------------------------------------------------------
935 | Takes an abstract floating-point value having sign `zSign', exponent
936 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
937 | and returns the proper extended double-precision floating-point value
938 | corresponding to the abstract input. This routine is just like
939 | `roundAndPackFloatx80' except that the input significand does not have to be
940 | normalized.
941 *----------------------------------------------------------------------------*/
943 static floatx80
944 normalizeRoundAndPackFloatx80(
945 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
946 STATUS_PARAM)
948 int8 shiftCount;
950 if ( zSig0 == 0 ) {
951 zSig0 = zSig1;
952 zSig1 = 0;
953 zExp -= 64;
955 shiftCount = countLeadingZeros64( zSig0 );
956 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
957 zExp -= shiftCount;
958 return
959 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
963 /*----------------------------------------------------------------------------
964 | Returns the least-significant 64 fraction bits of the quadruple-precision
965 | floating-point value `a'.
966 *----------------------------------------------------------------------------*/
968 static inline uint64_t extractFloat128Frac1( float128 a )
971 return a.low;
975 /*----------------------------------------------------------------------------
976 | Returns the most-significant 48 fraction bits of the quadruple-precision
977 | floating-point value `a'.
978 *----------------------------------------------------------------------------*/
980 static inline uint64_t extractFloat128Frac0( float128 a )
983 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
987 /*----------------------------------------------------------------------------
988 | Returns the exponent bits of the quadruple-precision floating-point value
989 | `a'.
990 *----------------------------------------------------------------------------*/
992 static inline int32 extractFloat128Exp( float128 a )
995 return ( a.high>>48 ) & 0x7FFF;
999 /*----------------------------------------------------------------------------
1000 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1001 *----------------------------------------------------------------------------*/
1003 static inline flag extractFloat128Sign( float128 a )
1006 return a.high>>63;
1010 /*----------------------------------------------------------------------------
1011 | Normalizes the subnormal quadruple-precision floating-point value
1012 | represented by the denormalized significand formed by the concatenation of
1013 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1014 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1015 | significand are stored at the location pointed to by `zSig0Ptr', and the
1016 | least significant 64 bits of the normalized significand are stored at the
1017 | location pointed to by `zSig1Ptr'.
1018 *----------------------------------------------------------------------------*/
1020 static void
1021 normalizeFloat128Subnormal(
1022 uint64_t aSig0,
1023 uint64_t aSig1,
1024 int32 *zExpPtr,
1025 uint64_t *zSig0Ptr,
1026 uint64_t *zSig1Ptr
1029 int8 shiftCount;
1031 if ( aSig0 == 0 ) {
1032 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1033 if ( shiftCount < 0 ) {
1034 *zSig0Ptr = aSig1>>( - shiftCount );
1035 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1037 else {
1038 *zSig0Ptr = aSig1<<shiftCount;
1039 *zSig1Ptr = 0;
1041 *zExpPtr = - shiftCount - 63;
1043 else {
1044 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1045 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1046 *zExpPtr = 1 - shiftCount;
1051 /*----------------------------------------------------------------------------
1052 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1053 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1054 | floating-point value, returning the result. After being shifted into the
1055 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1056 | added together to form the most significant 32 bits of the result. This
1057 | means that any integer portion of `zSig0' will be added into the exponent.
1058 | Since a properly normalized significand will have an integer portion equal
1059 | to 1, the `zExp' input should be 1 less than the desired result exponent
1060 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1061 | significand.
1062 *----------------------------------------------------------------------------*/
1064 static inline float128
1065 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1067 float128 z;
1069 z.low = zSig1;
1070 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1071 return z;
1075 /*----------------------------------------------------------------------------
1076 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1078 | and `zSig2', and returns the proper quadruple-precision floating-point value
1079 | corresponding to the abstract input. Ordinarily, the abstract value is
1080 | simply rounded and packed into the quadruple-precision format, with the
1081 | inexact exception raised if the abstract input cannot be represented
1082 | exactly. However, if the abstract value is too large, the overflow and
1083 | inexact exceptions are raised and an infinity or maximal finite value is
1084 | returned. If the abstract value is too small, the input value is rounded to
1085 | a subnormal number, and the underflow and inexact exceptions are raised if
1086 | the abstract input cannot be represented exactly as a subnormal quadruple-
1087 | precision floating-point number.
1088 | The input significand must be normalized or smaller. If the input
1089 | significand is not normalized, `zExp' must be 0; in that case, the result
1090 | returned is a subnormal number, and it must not require rounding. In the
1091 | usual case that the input significand is normalized, `zExp' must be 1 less
1092 | than the ``true'' floating-point exponent. The handling of underflow and
1093 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1094 *----------------------------------------------------------------------------*/
1096 static float128
1097 roundAndPackFloat128(
1098 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1100 int8 roundingMode;
1101 flag roundNearestEven, increment, isTiny;
1103 roundingMode = STATUS(float_rounding_mode);
1104 roundNearestEven = ( roundingMode == float_round_nearest_even );
1105 switch (roundingMode) {
1106 case float_round_nearest_even:
1107 case float_round_ties_away:
1108 increment = ((int64_t)zSig2 < 0);
1109 break;
1110 case float_round_to_zero:
1111 increment = 0;
1112 break;
1113 case float_round_up:
1114 increment = !zSign && zSig2;
1115 break;
1116 case float_round_down:
1117 increment = zSign && zSig2;
1118 break;
1119 default:
1120 abort();
1122 if ( 0x7FFD <= (uint32_t) zExp ) {
1123 if ( ( 0x7FFD < zExp )
1124 || ( ( zExp == 0x7FFD )
1125 && eq128(
1126 LIT64( 0x0001FFFFFFFFFFFF ),
1127 LIT64( 0xFFFFFFFFFFFFFFFF ),
1128 zSig0,
1129 zSig1
1131 && increment
1134 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1135 if ( ( roundingMode == float_round_to_zero )
1136 || ( zSign && ( roundingMode == float_round_up ) )
1137 || ( ! zSign && ( roundingMode == float_round_down ) )
1139 return
1140 packFloat128(
1141 zSign,
1142 0x7FFE,
1143 LIT64( 0x0000FFFFFFFFFFFF ),
1144 LIT64( 0xFFFFFFFFFFFFFFFF )
1147 return packFloat128( zSign, 0x7FFF, 0, 0 );
1149 if ( zExp < 0 ) {
1150 if (STATUS(flush_to_zero)) {
1151 float_raise(float_flag_output_denormal STATUS_VAR);
1152 return packFloat128(zSign, 0, 0, 0);
1154 isTiny =
1155 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1156 || ( zExp < -1 )
1157 || ! increment
1158 || lt128(
1159 zSig0,
1160 zSig1,
1161 LIT64( 0x0001FFFFFFFFFFFF ),
1162 LIT64( 0xFFFFFFFFFFFFFFFF )
1164 shift128ExtraRightJamming(
1165 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1166 zExp = 0;
1167 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1168 switch (roundingMode) {
1169 case float_round_nearest_even:
1170 case float_round_ties_away:
1171 increment = ((int64_t)zSig2 < 0);
1172 break;
1173 case float_round_to_zero:
1174 increment = 0;
1175 break;
1176 case float_round_up:
1177 increment = !zSign && zSig2;
1178 break;
1179 case float_round_down:
1180 increment = zSign && zSig2;
1181 break;
1182 default:
1183 abort();
1187 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1188 if ( increment ) {
1189 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1190 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1192 else {
1193 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1195 return packFloat128( zSign, zExp, zSig0, zSig1 );
1199 /*----------------------------------------------------------------------------
1200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1201 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1202 | returns the proper quadruple-precision floating-point value corresponding
1203 | to the abstract input. This routine is just like `roundAndPackFloat128'
1204 | except that the input significand has fewer bits and does not have to be
1205 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1206 | point exponent.
1207 *----------------------------------------------------------------------------*/
1209 static float128
1210 normalizeRoundAndPackFloat128(
1211 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1213 int8 shiftCount;
1214 uint64_t zSig2;
1216 if ( zSig0 == 0 ) {
1217 zSig0 = zSig1;
1218 zSig1 = 0;
1219 zExp -= 64;
1221 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1222 if ( 0 <= shiftCount ) {
1223 zSig2 = 0;
1224 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1226 else {
1227 shift128ExtraRightJamming(
1228 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1230 zExp -= shiftCount;
1231 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1235 /*----------------------------------------------------------------------------
1236 | Returns the result of converting the 32-bit two's complement integer `a'
1237 | to the single-precision floating-point format. The conversion is performed
1238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1239 *----------------------------------------------------------------------------*/
1241 float32 int32_to_float32(int32_t a STATUS_PARAM)
1243 flag zSign;
1245 if ( a == 0 ) return float32_zero;
1246 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1247 zSign = ( a < 0 );
1248 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 32-bit two's complement integer `a'
1254 | to the double-precision floating-point format. The conversion is performed
1255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256 *----------------------------------------------------------------------------*/
1258 float64 int32_to_float64(int32_t a STATUS_PARAM)
1260 flag zSign;
1261 uint32 absA;
1262 int8 shiftCount;
1263 uint64_t zSig;
1265 if ( a == 0 ) return float64_zero;
1266 zSign = ( a < 0 );
1267 absA = zSign ? - a : a;
1268 shiftCount = countLeadingZeros32( absA ) + 21;
1269 zSig = absA;
1270 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the extended double-precision floating-point format. The conversion
1277 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1278 | Arithmetic.
1279 *----------------------------------------------------------------------------*/
1281 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1283 flag zSign;
1284 uint32 absA;
1285 int8 shiftCount;
1286 uint64_t zSig;
1288 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1289 zSign = ( a < 0 );
1290 absA = zSign ? - a : a;
1291 shiftCount = countLeadingZeros32( absA ) + 32;
1292 zSig = absA;
1293 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1297 /*----------------------------------------------------------------------------
1298 | Returns the result of converting the 32-bit two's complement integer `a' to
1299 | the quadruple-precision floating-point format. The conversion is performed
1300 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301 *----------------------------------------------------------------------------*/
1303 float128 int32_to_float128(int32_t a STATUS_PARAM)
1305 flag zSign;
1306 uint32 absA;
1307 int8 shiftCount;
1308 uint64_t zSig0;
1310 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1311 zSign = ( a < 0 );
1312 absA = zSign ? - a : a;
1313 shiftCount = countLeadingZeros32( absA ) + 17;
1314 zSig0 = absA;
1315 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1319 /*----------------------------------------------------------------------------
1320 | Returns the result of converting the 64-bit two's complement integer `a'
1321 | to the single-precision floating-point format. The conversion is performed
1322 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1323 *----------------------------------------------------------------------------*/
1325 float32 int64_to_float32(int64_t a STATUS_PARAM)
1327 flag zSign;
1328 uint64 absA;
1329 int8 shiftCount;
1331 if ( a == 0 ) return float32_zero;
1332 zSign = ( a < 0 );
1333 absA = zSign ? - a : a;
1334 shiftCount = countLeadingZeros64( absA ) - 40;
1335 if ( 0 <= shiftCount ) {
1336 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1338 else {
1339 shiftCount += 7;
1340 if ( shiftCount < 0 ) {
1341 shift64RightJamming( absA, - shiftCount, &absA );
1343 else {
1344 absA <<= shiftCount;
1346 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the 64-bit two's complement integer `a'
1353 | to the double-precision floating-point format. The conversion is performed
1354 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1355 *----------------------------------------------------------------------------*/
1357 float64 int64_to_float64(int64_t a STATUS_PARAM)
1359 flag zSign;
1361 if ( a == 0 ) return float64_zero;
1362 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1363 return packFloat64( 1, 0x43E, 0 );
1365 zSign = ( a < 0 );
1366 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the 64-bit two's complement integer `a'
1372 | to the extended double-precision floating-point format. The conversion
1373 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic.
1375 *----------------------------------------------------------------------------*/
1377 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1379 flag zSign;
1380 uint64 absA;
1381 int8 shiftCount;
1383 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1384 zSign = ( a < 0 );
1385 absA = zSign ? - a : a;
1386 shiftCount = countLeadingZeros64( absA );
1387 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1391 /*----------------------------------------------------------------------------
1392 | Returns the result of converting the 64-bit two's complement integer `a' to
1393 | the quadruple-precision floating-point format. The conversion is performed
1394 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1395 *----------------------------------------------------------------------------*/
1397 float128 int64_to_float128(int64_t a STATUS_PARAM)
1399 flag zSign;
1400 uint64 absA;
1401 int8 shiftCount;
1402 int32 zExp;
1403 uint64_t zSig0, zSig1;
1405 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1406 zSign = ( a < 0 );
1407 absA = zSign ? - a : a;
1408 shiftCount = countLeadingZeros64( absA ) + 49;
1409 zExp = 0x406E - shiftCount;
1410 if ( 64 <= shiftCount ) {
1411 zSig1 = 0;
1412 zSig0 = absA;
1413 shiftCount -= 64;
1415 else {
1416 zSig1 = absA;
1417 zSig0 = 0;
1419 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1420 return packFloat128( zSign, zExp, zSig0, zSig1 );
1424 /*----------------------------------------------------------------------------
1425 | Returns the result of converting the 64-bit unsigned integer `a'
1426 | to the single-precision floating-point format. The conversion is performed
1427 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1428 *----------------------------------------------------------------------------*/
1430 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1432 int shiftcount;
1434 if (a == 0) {
1435 return float32_zero;
1438 /* Determine (left) shift needed to put first set bit into bit posn 23
1439 * (since packFloat32() expects the binary point between bits 23 and 22);
1440 * this is the fast case for smallish numbers.
1442 shiftcount = countLeadingZeros64(a) - 40;
1443 if (shiftcount >= 0) {
1444 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1446 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1447 * expects the binary point between bits 30 and 29, hence the + 7.
1449 shiftcount += 7;
1450 if (shiftcount < 0) {
1451 shift64RightJamming(a, -shiftcount, &a);
1452 } else {
1453 a <<= shiftcount;
1456 return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the 64-bit unsigned integer `a'
1461 | to the double-precision floating-point format. The conversion is performed
1462 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1463 *----------------------------------------------------------------------------*/
1465 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1467 int exp = 0x43C;
1468 int shiftcount;
1470 if (a == 0) {
1471 return float64_zero;
1474 shiftcount = countLeadingZeros64(a) - 1;
1475 if (shiftcount < 0) {
1476 shift64RightJamming(a, -shiftcount, &a);
1477 } else {
1478 a <<= shiftcount;
1480 return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1483 /*----------------------------------------------------------------------------
1484 | Returns the result of converting the 64-bit unsigned integer `a'
1485 | to the quadruple-precision floating-point format. The conversion is performed
1486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1487 *----------------------------------------------------------------------------*/
1489 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1491 if (a == 0) {
1492 return float128_zero;
1494 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 32-bit two's complement integer format. The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic---which means in particular that the conversion is rounded
1502 | according to the current rounding mode. If `a' is a NaN, the largest
1503 | positive integer is returned. Otherwise, if the conversion overflows, the
1504 | largest integer with the same sign as `a' is returned.
1505 *----------------------------------------------------------------------------*/
1507 int32 float32_to_int32( float32 a STATUS_PARAM )
1509 flag aSign;
1510 int_fast16_t aExp, shiftCount;
1511 uint32_t aSig;
1512 uint64_t aSig64;
1514 a = float32_squash_input_denormal(a STATUS_VAR);
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1519 if ( aExp ) aSig |= 0x00800000;
1520 shiftCount = 0xAF - aExp;
1521 aSig64 = aSig;
1522 aSig64 <<= 32;
1523 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1524 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1528 /*----------------------------------------------------------------------------
1529 | Returns the result of converting the single-precision floating-point value
1530 | `a' to the 32-bit two's complement integer format. The conversion is
1531 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1532 | Arithmetic, except that the conversion is always rounded toward zero.
1533 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1534 | the conversion overflows, the largest integer with the same sign as `a' is
1535 | returned.
1536 *----------------------------------------------------------------------------*/
1538 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1540 flag aSign;
1541 int_fast16_t aExp, shiftCount;
1542 uint32_t aSig;
1543 int32_t z;
1544 a = float32_squash_input_denormal(a STATUS_VAR);
1546 aSig = extractFloat32Frac( a );
1547 aExp = extractFloat32Exp( a );
1548 aSign = extractFloat32Sign( a );
1549 shiftCount = aExp - 0x9E;
1550 if ( 0 <= shiftCount ) {
1551 if ( float32_val(a) != 0xCF000000 ) {
1552 float_raise( float_flag_invalid STATUS_VAR);
1553 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1555 return (int32_t) 0x80000000;
1557 else if ( aExp <= 0x7E ) {
1558 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1559 return 0;
1561 aSig = ( aSig | 0x00800000 )<<8;
1562 z = aSig>>( - shiftCount );
1563 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1564 STATUS(float_exception_flags) |= float_flag_inexact;
1566 if ( aSign ) z = - z;
1567 return z;
1571 /*----------------------------------------------------------------------------
1572 | Returns the result of converting the single-precision floating-point value
1573 | `a' to the 16-bit two's complement integer format. The conversion is
1574 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1575 | Arithmetic, except that the conversion is always rounded toward zero.
1576 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1577 | the conversion overflows, the largest integer with the same sign as `a' is
1578 | returned.
1579 *----------------------------------------------------------------------------*/
1581 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1583 flag aSign;
1584 int_fast16_t aExp, shiftCount;
1585 uint32_t aSig;
1586 int32 z;
1588 aSig = extractFloat32Frac( a );
1589 aExp = extractFloat32Exp( a );
1590 aSign = extractFloat32Sign( a );
1591 shiftCount = aExp - 0x8E;
1592 if ( 0 <= shiftCount ) {
1593 if ( float32_val(a) != 0xC7000000 ) {
1594 float_raise( float_flag_invalid STATUS_VAR);
1595 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1596 return 0x7FFF;
1599 return (int32_t) 0xffff8000;
1601 else if ( aExp <= 0x7E ) {
1602 if ( aExp | aSig ) {
1603 STATUS(float_exception_flags) |= float_flag_inexact;
1605 return 0;
1607 shiftCount -= 0x10;
1608 aSig = ( aSig | 0x00800000 )<<8;
1609 z = aSig>>( - shiftCount );
1610 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1611 STATUS(float_exception_flags) |= float_flag_inexact;
1613 if ( aSign ) {
1614 z = - z;
1616 return z;
1620 /*----------------------------------------------------------------------------
1621 | Returns the result of converting the single-precision floating-point value
1622 | `a' to the 64-bit two's complement integer format. The conversion is
1623 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1624 | Arithmetic---which means in particular that the conversion is rounded
1625 | according to the current rounding mode. If `a' is a NaN, the largest
1626 | positive integer is returned. Otherwise, if the conversion overflows, the
1627 | largest integer with the same sign as `a' is returned.
1628 *----------------------------------------------------------------------------*/
1630 int64 float32_to_int64( float32 a STATUS_PARAM )
1632 flag aSign;
1633 int_fast16_t aExp, shiftCount;
1634 uint32_t aSig;
1635 uint64_t aSig64, aSigExtra;
1636 a = float32_squash_input_denormal(a STATUS_VAR);
1638 aSig = extractFloat32Frac( a );
1639 aExp = extractFloat32Exp( a );
1640 aSign = extractFloat32Sign( a );
1641 shiftCount = 0xBE - aExp;
1642 if ( shiftCount < 0 ) {
1643 float_raise( float_flag_invalid STATUS_VAR);
1644 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1645 return LIT64( 0x7FFFFFFFFFFFFFFF );
1647 return (int64_t) LIT64( 0x8000000000000000 );
1649 if ( aExp ) aSig |= 0x00800000;
1650 aSig64 = aSig;
1651 aSig64 <<= 40;
1652 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1653 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1657 /*----------------------------------------------------------------------------
1658 | Returns the result of converting the single-precision floating-point value
1659 | `a' to the 64-bit unsigned integer format. The conversion is
1660 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1661 | Arithmetic---which means in particular that the conversion is rounded
1662 | according to the current rounding mode. If `a' is a NaN, the largest
1663 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1664 | largest unsigned integer is returned. If the 'a' is negative, the result
1665 | is rounded and zero is returned; values that do not round to zero will
1666 | raise the inexact exception flag.
1667 *----------------------------------------------------------------------------*/
1669 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1671 flag aSign;
1672 int_fast16_t aExp, shiftCount;
1673 uint32_t aSig;
1674 uint64_t aSig64, aSigExtra;
1675 a = float32_squash_input_denormal(a STATUS_VAR);
1677 aSig = extractFloat32Frac(a);
1678 aExp = extractFloat32Exp(a);
1679 aSign = extractFloat32Sign(a);
1680 if ((aSign) && (aExp > 126)) {
1681 float_raise(float_flag_invalid STATUS_VAR);
1682 if (float32_is_any_nan(a)) {
1683 return LIT64(0xFFFFFFFFFFFFFFFF);
1684 } else {
1685 return 0;
1688 shiftCount = 0xBE - aExp;
1689 if (aExp) {
1690 aSig |= 0x00800000;
1692 if (shiftCount < 0) {
1693 float_raise(float_flag_invalid STATUS_VAR);
1694 return LIT64(0xFFFFFFFFFFFFFFFF);
1697 aSig64 = aSig;
1698 aSig64 <<= 40;
1699 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1700 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1703 /*----------------------------------------------------------------------------
1704 | Returns the result of converting the single-precision floating-point value
1705 | `a' to the 64-bit unsigned integer format. The conversion is
1706 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1707 | Arithmetic, except that the conversion is always rounded toward zero. If
1708 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1709 | conversion overflows, the largest unsigned integer is returned. If the
1710 | 'a' is negative, the result is rounded and zero is returned; values that do
1711 | not round to zero will raise the inexact flag.
1712 *----------------------------------------------------------------------------*/
1714 uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1716 signed char current_rounding_mode = STATUS(float_rounding_mode);
1717 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1718 int64_t v = float32_to_uint64(a STATUS_VAR);
1719 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1720 return v;
1723 /*----------------------------------------------------------------------------
1724 | Returns the result of converting the single-precision floating-point value
1725 | `a' to the 64-bit two's complement integer format. The conversion is
1726 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1727 | Arithmetic, except that the conversion is always rounded toward zero. If
1728 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1729 | conversion overflows, the largest integer with the same sign as `a' is
1730 | returned.
1731 *----------------------------------------------------------------------------*/
1733 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1735 flag aSign;
1736 int_fast16_t aExp, shiftCount;
1737 uint32_t aSig;
1738 uint64_t aSig64;
1739 int64 z;
1740 a = float32_squash_input_denormal(a STATUS_VAR);
1742 aSig = extractFloat32Frac( a );
1743 aExp = extractFloat32Exp( a );
1744 aSign = extractFloat32Sign( a );
1745 shiftCount = aExp - 0xBE;
1746 if ( 0 <= shiftCount ) {
1747 if ( float32_val(a) != 0xDF000000 ) {
1748 float_raise( float_flag_invalid STATUS_VAR);
1749 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1750 return LIT64( 0x7FFFFFFFFFFFFFFF );
1753 return (int64_t) LIT64( 0x8000000000000000 );
1755 else if ( aExp <= 0x7E ) {
1756 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1757 return 0;
1759 aSig64 = aSig | 0x00800000;
1760 aSig64 <<= 40;
1761 z = aSig64>>( - shiftCount );
1762 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1763 STATUS(float_exception_flags) |= float_flag_inexact;
1765 if ( aSign ) z = - z;
1766 return z;
1770 /*----------------------------------------------------------------------------
1771 | Returns the result of converting the single-precision floating-point value
1772 | `a' to the double-precision floating-point format. The conversion is
1773 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1774 | Arithmetic.
1775 *----------------------------------------------------------------------------*/
1777 float64 float32_to_float64( float32 a STATUS_PARAM )
1779 flag aSign;
1780 int_fast16_t aExp;
1781 uint32_t aSig;
1782 a = float32_squash_input_denormal(a STATUS_VAR);
1784 aSig = extractFloat32Frac( a );
1785 aExp = extractFloat32Exp( a );
1786 aSign = extractFloat32Sign( a );
1787 if ( aExp == 0xFF ) {
1788 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1789 return packFloat64( aSign, 0x7FF, 0 );
1791 if ( aExp == 0 ) {
1792 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1793 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1794 --aExp;
1796 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of converting the single-precision floating-point value
1802 | `a' to the extended double-precision floating-point format. The conversion
1803 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1804 | Arithmetic.
1805 *----------------------------------------------------------------------------*/
1807 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1809 flag aSign;
1810 int_fast16_t aExp;
1811 uint32_t aSig;
1813 a = float32_squash_input_denormal(a STATUS_VAR);
1814 aSig = extractFloat32Frac( a );
1815 aExp = extractFloat32Exp( a );
1816 aSign = extractFloat32Sign( a );
1817 if ( aExp == 0xFF ) {
1818 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1819 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1821 if ( aExp == 0 ) {
1822 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1823 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825 aSig |= 0x00800000;
1826 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1830 /*----------------------------------------------------------------------------
1831 | Returns the result of converting the single-precision floating-point value
1832 | `a' to the double-precision floating-point format. The conversion is
1833 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1834 | Arithmetic.
1835 *----------------------------------------------------------------------------*/
1837 float128 float32_to_float128( float32 a STATUS_PARAM )
1839 flag aSign;
1840 int_fast16_t aExp;
1841 uint32_t aSig;
1843 a = float32_squash_input_denormal(a STATUS_VAR);
1844 aSig = extractFloat32Frac( a );
1845 aExp = extractFloat32Exp( a );
1846 aSign = extractFloat32Sign( a );
1847 if ( aExp == 0xFF ) {
1848 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1849 return packFloat128( aSign, 0x7FFF, 0, 0 );
1851 if ( aExp == 0 ) {
1852 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1853 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1854 --aExp;
1856 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1860 /*----------------------------------------------------------------------------
1861 | Rounds the single-precision floating-point value `a' to an integer, and
1862 | returns the result as a single-precision floating-point value. The
1863 | operation is performed according to the IEC/IEEE Standard for Binary
1864 | Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1867 float32 float32_round_to_int( float32 a STATUS_PARAM)
1869 flag aSign;
1870 int_fast16_t aExp;
1871 uint32_t lastBitMask, roundBitsMask;
1872 uint32_t z;
1873 a = float32_squash_input_denormal(a STATUS_VAR);
1875 aExp = extractFloat32Exp( a );
1876 if ( 0x96 <= aExp ) {
1877 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1878 return propagateFloat32NaN( a, a STATUS_VAR );
1880 return a;
1882 if ( aExp <= 0x7E ) {
1883 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1884 STATUS(float_exception_flags) |= float_flag_inexact;
1885 aSign = extractFloat32Sign( a );
1886 switch ( STATUS(float_rounding_mode) ) {
1887 case float_round_nearest_even:
1888 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1889 return packFloat32( aSign, 0x7F, 0 );
1891 break;
1892 case float_round_ties_away:
1893 if (aExp == 0x7E) {
1894 return packFloat32(aSign, 0x7F, 0);
1896 break;
1897 case float_round_down:
1898 return make_float32(aSign ? 0xBF800000 : 0);
1899 case float_round_up:
1900 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1902 return packFloat32( aSign, 0, 0 );
1904 lastBitMask = 1;
1905 lastBitMask <<= 0x96 - aExp;
1906 roundBitsMask = lastBitMask - 1;
1907 z = float32_val(a);
1908 switch (STATUS(float_rounding_mode)) {
1909 case float_round_nearest_even:
1910 z += lastBitMask>>1;
1911 if ((z & roundBitsMask) == 0) {
1912 z &= ~lastBitMask;
1914 break;
1915 case float_round_ties_away:
1916 z += lastBitMask >> 1;
1917 break;
1918 case float_round_to_zero:
1919 break;
1920 case float_round_up:
1921 if (!extractFloat32Sign(make_float32(z))) {
1922 z += roundBitsMask;
1924 break;
1925 case float_round_down:
1926 if (extractFloat32Sign(make_float32(z))) {
1927 z += roundBitsMask;
1929 break;
1930 default:
1931 abort();
1933 z &= ~ roundBitsMask;
1934 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1935 return make_float32(z);
1939 /*----------------------------------------------------------------------------
1940 | Returns the result of adding the absolute values of the single-precision
1941 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1942 | before being returned. `zSign' is ignored if the result is a NaN.
1943 | The addition is performed according to the IEC/IEEE Standard for Binary
1944 | Floating-Point Arithmetic.
1945 *----------------------------------------------------------------------------*/
1947 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1949 int_fast16_t aExp, bExp, zExp;
1950 uint32_t aSig, bSig, zSig;
1951 int_fast16_t expDiff;
1953 aSig = extractFloat32Frac( a );
1954 aExp = extractFloat32Exp( a );
1955 bSig = extractFloat32Frac( b );
1956 bExp = extractFloat32Exp( b );
1957 expDiff = aExp - bExp;
1958 aSig <<= 6;
1959 bSig <<= 6;
1960 if ( 0 < expDiff ) {
1961 if ( aExp == 0xFF ) {
1962 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1963 return a;
1965 if ( bExp == 0 ) {
1966 --expDiff;
1968 else {
1969 bSig |= 0x20000000;
1971 shift32RightJamming( bSig, expDiff, &bSig );
1972 zExp = aExp;
1974 else if ( expDiff < 0 ) {
1975 if ( bExp == 0xFF ) {
1976 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1977 return packFloat32( zSign, 0xFF, 0 );
1979 if ( aExp == 0 ) {
1980 ++expDiff;
1982 else {
1983 aSig |= 0x20000000;
1985 shift32RightJamming( aSig, - expDiff, &aSig );
1986 zExp = bExp;
1988 else {
1989 if ( aExp == 0xFF ) {
1990 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1991 return a;
1993 if ( aExp == 0 ) {
1994 if (STATUS(flush_to_zero)) {
1995 if (aSig | bSig) {
1996 float_raise(float_flag_output_denormal STATUS_VAR);
1998 return packFloat32(zSign, 0, 0);
2000 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2002 zSig = 0x40000000 + aSig + bSig;
2003 zExp = aExp;
2004 goto roundAndPack;
2006 aSig |= 0x20000000;
2007 zSig = ( aSig + bSig )<<1;
2008 --zExp;
2009 if ( (int32_t) zSig < 0 ) {
2010 zSig = aSig + bSig;
2011 ++zExp;
2013 roundAndPack:
2014 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2018 /*----------------------------------------------------------------------------
2019 | Returns the result of subtracting the absolute values of the single-
2020 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2021 | difference is negated before being returned. `zSign' is ignored if the
2022 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2023 | Standard for Binary Floating-Point Arithmetic.
2024 *----------------------------------------------------------------------------*/
2026 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
2028 int_fast16_t aExp, bExp, zExp;
2029 uint32_t aSig, bSig, zSig;
2030 int_fast16_t expDiff;
2032 aSig = extractFloat32Frac( a );
2033 aExp = extractFloat32Exp( a );
2034 bSig = extractFloat32Frac( b );
2035 bExp = extractFloat32Exp( b );
2036 expDiff = aExp - bExp;
2037 aSig <<= 7;
2038 bSig <<= 7;
2039 if ( 0 < expDiff ) goto aExpBigger;
2040 if ( expDiff < 0 ) goto bExpBigger;
2041 if ( aExp == 0xFF ) {
2042 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2043 float_raise( float_flag_invalid STATUS_VAR);
2044 return float32_default_nan;
2046 if ( aExp == 0 ) {
2047 aExp = 1;
2048 bExp = 1;
2050 if ( bSig < aSig ) goto aBigger;
2051 if ( aSig < bSig ) goto bBigger;
2052 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2053 bExpBigger:
2054 if ( bExp == 0xFF ) {
2055 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2056 return packFloat32( zSign ^ 1, 0xFF, 0 );
2058 if ( aExp == 0 ) {
2059 ++expDiff;
2061 else {
2062 aSig |= 0x40000000;
2064 shift32RightJamming( aSig, - expDiff, &aSig );
2065 bSig |= 0x40000000;
2066 bBigger:
2067 zSig = bSig - aSig;
2068 zExp = bExp;
2069 zSign ^= 1;
2070 goto normalizeRoundAndPack;
2071 aExpBigger:
2072 if ( aExp == 0xFF ) {
2073 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074 return a;
2076 if ( bExp == 0 ) {
2077 --expDiff;
2079 else {
2080 bSig |= 0x40000000;
2082 shift32RightJamming( bSig, expDiff, &bSig );
2083 aSig |= 0x40000000;
2084 aBigger:
2085 zSig = aSig - bSig;
2086 zExp = aExp;
2087 normalizeRoundAndPack:
2088 --zExp;
2089 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2093 /*----------------------------------------------------------------------------
2094 | Returns the result of adding the single-precision floating-point values `a'
2095 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2096 | Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2099 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2101 flag aSign, bSign;
2102 a = float32_squash_input_denormal(a STATUS_VAR);
2103 b = float32_squash_input_denormal(b STATUS_VAR);
2105 aSign = extractFloat32Sign( a );
2106 bSign = extractFloat32Sign( b );
2107 if ( aSign == bSign ) {
2108 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2110 else {
2111 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2116 /*----------------------------------------------------------------------------
2117 | Returns the result of subtracting the single-precision floating-point values
2118 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2119 | for Binary Floating-Point Arithmetic.
2120 *----------------------------------------------------------------------------*/
2122 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2124 flag aSign, bSign;
2125 a = float32_squash_input_denormal(a STATUS_VAR);
2126 b = float32_squash_input_denormal(b STATUS_VAR);
2128 aSign = extractFloat32Sign( a );
2129 bSign = extractFloat32Sign( b );
2130 if ( aSign == bSign ) {
2131 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2133 else {
2134 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2139 /*----------------------------------------------------------------------------
2140 | Returns the result of multiplying the single-precision floating-point values
2141 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2142 | for Binary Floating-Point Arithmetic.
2143 *----------------------------------------------------------------------------*/
2145 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2147 flag aSign, bSign, zSign;
2148 int_fast16_t aExp, bExp, zExp;
2149 uint32_t aSig, bSig;
2150 uint64_t zSig64;
2151 uint32_t zSig;
2153 a = float32_squash_input_denormal(a STATUS_VAR);
2154 b = float32_squash_input_denormal(b STATUS_VAR);
2156 aSig = extractFloat32Frac( a );
2157 aExp = extractFloat32Exp( a );
2158 aSign = extractFloat32Sign( a );
2159 bSig = extractFloat32Frac( b );
2160 bExp = extractFloat32Exp( b );
2161 bSign = extractFloat32Sign( b );
2162 zSign = aSign ^ bSign;
2163 if ( aExp == 0xFF ) {
2164 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2165 return propagateFloat32NaN( a, b STATUS_VAR );
2167 if ( ( bExp | bSig ) == 0 ) {
2168 float_raise( float_flag_invalid STATUS_VAR);
2169 return float32_default_nan;
2171 return packFloat32( zSign, 0xFF, 0 );
2173 if ( bExp == 0xFF ) {
2174 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2175 if ( ( aExp | aSig ) == 0 ) {
2176 float_raise( float_flag_invalid STATUS_VAR);
2177 return float32_default_nan;
2179 return packFloat32( zSign, 0xFF, 0 );
2181 if ( aExp == 0 ) {
2182 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2183 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2185 if ( bExp == 0 ) {
2186 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2187 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2189 zExp = aExp + bExp - 0x7F;
2190 aSig = ( aSig | 0x00800000 )<<7;
2191 bSig = ( bSig | 0x00800000 )<<8;
2192 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2193 zSig = zSig64;
2194 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2195 zSig <<= 1;
2196 --zExp;
2198 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of dividing the single-precision floating-point value `a'
2204 | by the corresponding value `b'. The operation is performed according to the
2205 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2208 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2210 flag aSign, bSign, zSign;
2211 int_fast16_t aExp, bExp, zExp;
2212 uint32_t aSig, bSig, zSig;
2213 a = float32_squash_input_denormal(a STATUS_VAR);
2214 b = float32_squash_input_denormal(b STATUS_VAR);
2216 aSig = extractFloat32Frac( a );
2217 aExp = extractFloat32Exp( a );
2218 aSign = extractFloat32Sign( a );
2219 bSig = extractFloat32Frac( b );
2220 bExp = extractFloat32Exp( b );
2221 bSign = extractFloat32Sign( b );
2222 zSign = aSign ^ bSign;
2223 if ( aExp == 0xFF ) {
2224 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2225 if ( bExp == 0xFF ) {
2226 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2227 float_raise( float_flag_invalid STATUS_VAR);
2228 return float32_default_nan;
2230 return packFloat32( zSign, 0xFF, 0 );
2232 if ( bExp == 0xFF ) {
2233 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2234 return packFloat32( zSign, 0, 0 );
2236 if ( bExp == 0 ) {
2237 if ( bSig == 0 ) {
2238 if ( ( aExp | aSig ) == 0 ) {
2239 float_raise( float_flag_invalid STATUS_VAR);
2240 return float32_default_nan;
2242 float_raise( float_flag_divbyzero STATUS_VAR);
2243 return packFloat32( zSign, 0xFF, 0 );
2245 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2247 if ( aExp == 0 ) {
2248 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2249 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2251 zExp = aExp - bExp + 0x7D;
2252 aSig = ( aSig | 0x00800000 )<<7;
2253 bSig = ( bSig | 0x00800000 )<<8;
2254 if ( bSig <= ( aSig + aSig ) ) {
2255 aSig >>= 1;
2256 ++zExp;
2258 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2259 if ( ( zSig & 0x3F ) == 0 ) {
2260 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2262 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2266 /*----------------------------------------------------------------------------
2267 | Returns the remainder of the single-precision floating-point value `a'
2268 | with respect to the corresponding value `b'. The operation is performed
2269 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270 *----------------------------------------------------------------------------*/
2272 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2274 flag aSign, zSign;
2275 int_fast16_t aExp, bExp, expDiff;
2276 uint32_t aSig, bSig;
2277 uint32_t q;
2278 uint64_t aSig64, bSig64, q64;
2279 uint32_t alternateASig;
2280 int32_t sigMean;
2281 a = float32_squash_input_denormal(a STATUS_VAR);
2282 b = float32_squash_input_denormal(b STATUS_VAR);
2284 aSig = extractFloat32Frac( a );
2285 aExp = extractFloat32Exp( a );
2286 aSign = extractFloat32Sign( a );
2287 bSig = extractFloat32Frac( b );
2288 bExp = extractFloat32Exp( b );
2289 if ( aExp == 0xFF ) {
2290 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2291 return propagateFloat32NaN( a, b STATUS_VAR );
2293 float_raise( float_flag_invalid STATUS_VAR);
2294 return float32_default_nan;
2296 if ( bExp == 0xFF ) {
2297 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2298 return a;
2300 if ( bExp == 0 ) {
2301 if ( bSig == 0 ) {
2302 float_raise( float_flag_invalid STATUS_VAR);
2303 return float32_default_nan;
2305 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2307 if ( aExp == 0 ) {
2308 if ( aSig == 0 ) return a;
2309 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2311 expDiff = aExp - bExp;
2312 aSig |= 0x00800000;
2313 bSig |= 0x00800000;
2314 if ( expDiff < 32 ) {
2315 aSig <<= 8;
2316 bSig <<= 8;
2317 if ( expDiff < 0 ) {
2318 if ( expDiff < -1 ) return a;
2319 aSig >>= 1;
2321 q = ( bSig <= aSig );
2322 if ( q ) aSig -= bSig;
2323 if ( 0 < expDiff ) {
2324 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2325 q >>= 32 - expDiff;
2326 bSig >>= 2;
2327 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2329 else {
2330 aSig >>= 2;
2331 bSig >>= 2;
2334 else {
2335 if ( bSig <= aSig ) aSig -= bSig;
2336 aSig64 = ( (uint64_t) aSig )<<40;
2337 bSig64 = ( (uint64_t) bSig )<<40;
2338 expDiff -= 64;
2339 while ( 0 < expDiff ) {
2340 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2341 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2342 aSig64 = - ( ( bSig * q64 )<<38 );
2343 expDiff -= 62;
2345 expDiff += 64;
2346 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2347 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2348 q = q64>>( 64 - expDiff );
2349 bSig <<= 6;
2350 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2352 do {
2353 alternateASig = aSig;
2354 ++q;
2355 aSig -= bSig;
2356 } while ( 0 <= (int32_t) aSig );
2357 sigMean = aSig + alternateASig;
2358 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2359 aSig = alternateASig;
2361 zSign = ( (int32_t) aSig < 0 );
2362 if ( zSign ) aSig = - aSig;
2363 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2367 /*----------------------------------------------------------------------------
2368 | Returns the result of multiplying the single-precision floating-point values
2369 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2370 | multiplication. The operation is performed according to the IEC/IEEE
2371 | Standard for Binary Floating-Point Arithmetic 754-2008.
2372 | The flags argument allows the caller to select negation of the
2373 | addend, the intermediate product, or the final result. (The difference
2374 | between this and having the caller do a separate negation is that negating
2375 | externally will flip the sign bit on NaNs.)
2376 *----------------------------------------------------------------------------*/
2378 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2380 flag aSign, bSign, cSign, zSign;
2381 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2382 uint32_t aSig, bSig, cSig;
2383 flag pInf, pZero, pSign;
2384 uint64_t pSig64, cSig64, zSig64;
2385 uint32_t pSig;
2386 int shiftcount;
2387 flag signflip, infzero;
2389 a = float32_squash_input_denormal(a STATUS_VAR);
2390 b = float32_squash_input_denormal(b STATUS_VAR);
2391 c = float32_squash_input_denormal(c STATUS_VAR);
2392 aSig = extractFloat32Frac(a);
2393 aExp = extractFloat32Exp(a);
2394 aSign = extractFloat32Sign(a);
2395 bSig = extractFloat32Frac(b);
2396 bExp = extractFloat32Exp(b);
2397 bSign = extractFloat32Sign(b);
2398 cSig = extractFloat32Frac(c);
2399 cExp = extractFloat32Exp(c);
2400 cSign = extractFloat32Sign(c);
2402 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2403 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2405 /* It is implementation-defined whether the cases of (0,inf,qnan)
2406 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2407 * they return if they do), so we have to hand this information
2408 * off to the target-specific pick-a-NaN routine.
2410 if (((aExp == 0xff) && aSig) ||
2411 ((bExp == 0xff) && bSig) ||
2412 ((cExp == 0xff) && cSig)) {
2413 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2416 if (infzero) {
2417 float_raise(float_flag_invalid STATUS_VAR);
2418 return float32_default_nan;
2421 if (flags & float_muladd_negate_c) {
2422 cSign ^= 1;
2425 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2427 /* Work out the sign and type of the product */
2428 pSign = aSign ^ bSign;
2429 if (flags & float_muladd_negate_product) {
2430 pSign ^= 1;
2432 pInf = (aExp == 0xff) || (bExp == 0xff);
2433 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2435 if (cExp == 0xff) {
2436 if (pInf && (pSign ^ cSign)) {
2437 /* addition of opposite-signed infinities => InvalidOperation */
2438 float_raise(float_flag_invalid STATUS_VAR);
2439 return float32_default_nan;
2441 /* Otherwise generate an infinity of the same sign */
2442 return packFloat32(cSign ^ signflip, 0xff, 0);
2445 if (pInf) {
2446 return packFloat32(pSign ^ signflip, 0xff, 0);
2449 if (pZero) {
2450 if (cExp == 0) {
2451 if (cSig == 0) {
2452 /* Adding two exact zeroes */
2453 if (pSign == cSign) {
2454 zSign = pSign;
2455 } else if (STATUS(float_rounding_mode) == float_round_down) {
2456 zSign = 1;
2457 } else {
2458 zSign = 0;
2460 return packFloat32(zSign ^ signflip, 0, 0);
2462 /* Exact zero plus a denorm */
2463 if (STATUS(flush_to_zero)) {
2464 float_raise(float_flag_output_denormal STATUS_VAR);
2465 return packFloat32(cSign ^ signflip, 0, 0);
2468 /* Zero plus something non-zero : just return the something */
2469 if (flags & float_muladd_halve_result) {
2470 if (cExp == 0) {
2471 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2473 /* Subtract one to halve, and one again because roundAndPackFloat32
2474 * wants one less than the true exponent.
2476 cExp -= 2;
2477 cSig = (cSig | 0x00800000) << 7;
2478 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2480 return packFloat32(cSign ^ signflip, cExp, cSig);
2483 if (aExp == 0) {
2484 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2486 if (bExp == 0) {
2487 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2490 /* Calculate the actual result a * b + c */
2492 /* Multiply first; this is easy. */
2493 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2494 * because we want the true exponent, not the "one-less-than"
2495 * flavour that roundAndPackFloat32() takes.
2497 pExp = aExp + bExp - 0x7e;
2498 aSig = (aSig | 0x00800000) << 7;
2499 bSig = (bSig | 0x00800000) << 8;
2500 pSig64 = (uint64_t)aSig * bSig;
2501 if ((int64_t)(pSig64 << 1) >= 0) {
2502 pSig64 <<= 1;
2503 pExp--;
2506 zSign = pSign ^ signflip;
2508 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2509 * position 62.
2511 if (cExp == 0) {
2512 if (!cSig) {
2513 /* Throw out the special case of c being an exact zero now */
2514 shift64RightJamming(pSig64, 32, &pSig64);
2515 pSig = pSig64;
2516 if (flags & float_muladd_halve_result) {
2517 pExp--;
2519 return roundAndPackFloat32(zSign, pExp - 1,
2520 pSig STATUS_VAR);
2522 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2525 cSig64 = (uint64_t)cSig << (62 - 23);
2526 cSig64 |= LIT64(0x4000000000000000);
2527 expDiff = pExp - cExp;
2529 if (pSign == cSign) {
2530 /* Addition */
2531 if (expDiff > 0) {
2532 /* scale c to match p */
2533 shift64RightJamming(cSig64, expDiff, &cSig64);
2534 zExp = pExp;
2535 } else if (expDiff < 0) {
2536 /* scale p to match c */
2537 shift64RightJamming(pSig64, -expDiff, &pSig64);
2538 zExp = cExp;
2539 } else {
2540 /* no scaling needed */
2541 zExp = cExp;
2543 /* Add significands and make sure explicit bit ends up in posn 62 */
2544 zSig64 = pSig64 + cSig64;
2545 if ((int64_t)zSig64 < 0) {
2546 shift64RightJamming(zSig64, 1, &zSig64);
2547 } else {
2548 zExp--;
2550 } else {
2551 /* Subtraction */
2552 if (expDiff > 0) {
2553 shift64RightJamming(cSig64, expDiff, &cSig64);
2554 zSig64 = pSig64 - cSig64;
2555 zExp = pExp;
2556 } else if (expDiff < 0) {
2557 shift64RightJamming(pSig64, -expDiff, &pSig64);
2558 zSig64 = cSig64 - pSig64;
2559 zExp = cExp;
2560 zSign ^= 1;
2561 } else {
2562 zExp = pExp;
2563 if (cSig64 < pSig64) {
2564 zSig64 = pSig64 - cSig64;
2565 } else if (pSig64 < cSig64) {
2566 zSig64 = cSig64 - pSig64;
2567 zSign ^= 1;
2568 } else {
2569 /* Exact zero */
2570 zSign = signflip;
2571 if (STATUS(float_rounding_mode) == float_round_down) {
2572 zSign ^= 1;
2574 return packFloat32(zSign, 0, 0);
2577 --zExp;
2578 /* Normalize to put the explicit bit back into bit 62. */
2579 shiftcount = countLeadingZeros64(zSig64) - 1;
2580 zSig64 <<= shiftcount;
2581 zExp -= shiftcount;
2583 if (flags & float_muladd_halve_result) {
2584 zExp--;
2587 shift64RightJamming(zSig64, 32, &zSig64);
2588 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2592 /*----------------------------------------------------------------------------
2593 | Returns the square root of the single-precision floating-point value `a'.
2594 | The operation is performed according to the IEC/IEEE Standard for Binary
2595 | Floating-Point Arithmetic.
2596 *----------------------------------------------------------------------------*/
2598 float32 float32_sqrt( float32 a STATUS_PARAM )
2600 flag aSign;
2601 int_fast16_t aExp, zExp;
2602 uint32_t aSig, zSig;
2603 uint64_t rem, term;
2604 a = float32_squash_input_denormal(a STATUS_VAR);
2606 aSig = extractFloat32Frac( a );
2607 aExp = extractFloat32Exp( a );
2608 aSign = extractFloat32Sign( a );
2609 if ( aExp == 0xFF ) {
2610 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2611 if ( ! aSign ) return a;
2612 float_raise( float_flag_invalid STATUS_VAR);
2613 return float32_default_nan;
2615 if ( aSign ) {
2616 if ( ( aExp | aSig ) == 0 ) return a;
2617 float_raise( float_flag_invalid STATUS_VAR);
2618 return float32_default_nan;
2620 if ( aExp == 0 ) {
2621 if ( aSig == 0 ) return float32_zero;
2622 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2624 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2625 aSig = ( aSig | 0x00800000 )<<8;
2626 zSig = estimateSqrt32( aExp, aSig ) + 2;
2627 if ( ( zSig & 0x7F ) <= 5 ) {
2628 if ( zSig < 2 ) {
2629 zSig = 0x7FFFFFFF;
2630 goto roundAndPack;
2632 aSig >>= aExp & 1;
2633 term = ( (uint64_t) zSig ) * zSig;
2634 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2635 while ( (int64_t) rem < 0 ) {
2636 --zSig;
2637 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2639 zSig |= ( rem != 0 );
2641 shift32RightJamming( zSig, 1, &zSig );
2642 roundAndPack:
2643 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2647 /*----------------------------------------------------------------------------
2648 | Returns the binary exponential of the single-precision floating-point value
2649 | `a'. The operation is performed according to the IEC/IEEE Standard for
2650 | Binary Floating-Point Arithmetic.
2652 | Uses the following identities:
2654 | 1. -------------------------------------------------------------------------
2655 | x x*ln(2)
2656 | 2 = e
2658 | 2. -------------------------------------------------------------------------
2659 | 2 3 4 5 n
2660 | x x x x x x x
2661 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2662 | 1! 2! 3! 4! 5! n!
2663 *----------------------------------------------------------------------------*/
2665 static const float64 float32_exp2_coefficients[15] =
2667 const_float64( 0x3ff0000000000000ll ), /* 1 */
2668 const_float64( 0x3fe0000000000000ll ), /* 2 */
2669 const_float64( 0x3fc5555555555555ll ), /* 3 */
2670 const_float64( 0x3fa5555555555555ll ), /* 4 */
2671 const_float64( 0x3f81111111111111ll ), /* 5 */
2672 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2673 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2674 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2675 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2676 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2677 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2678 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2679 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2680 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2681 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2684 float32 float32_exp2( float32 a STATUS_PARAM )
2686 flag aSign;
2687 int_fast16_t aExp;
2688 uint32_t aSig;
2689 float64 r, x, xn;
2690 int i;
2691 a = float32_squash_input_denormal(a STATUS_VAR);
2693 aSig = extractFloat32Frac( a );
2694 aExp = extractFloat32Exp( a );
2695 aSign = extractFloat32Sign( a );
2697 if ( aExp == 0xFF) {
2698 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2699 return (aSign) ? float32_zero : a;
2701 if (aExp == 0) {
2702 if (aSig == 0) return float32_one;
2705 float_raise( float_flag_inexact STATUS_VAR);
2707 /* ******************************* */
2708 /* using float64 for approximation */
2709 /* ******************************* */
2710 x = float32_to_float64(a STATUS_VAR);
2711 x = float64_mul(x, float64_ln2 STATUS_VAR);
2713 xn = x;
2714 r = float64_one;
2715 for (i = 0 ; i < 15 ; i++) {
2716 float64 f;
2718 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2719 r = float64_add(r, f STATUS_VAR);
2721 xn = float64_mul(xn, x STATUS_VAR);
2724 return float64_to_float32(r, status);
2727 /*----------------------------------------------------------------------------
2728 | Returns the binary log of the single-precision floating-point value `a'.
2729 | The operation is performed according to the IEC/IEEE Standard for Binary
2730 | Floating-Point Arithmetic.
2731 *----------------------------------------------------------------------------*/
2732 float32 float32_log2( float32 a STATUS_PARAM )
2734 flag aSign, zSign;
2735 int_fast16_t aExp;
2736 uint32_t aSig, zSig, i;
2738 a = float32_squash_input_denormal(a STATUS_VAR);
2739 aSig = extractFloat32Frac( a );
2740 aExp = extractFloat32Exp( a );
2741 aSign = extractFloat32Sign( a );
2743 if ( aExp == 0 ) {
2744 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2745 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2747 if ( aSign ) {
2748 float_raise( float_flag_invalid STATUS_VAR);
2749 return float32_default_nan;
2751 if ( aExp == 0xFF ) {
2752 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2753 return a;
2756 aExp -= 0x7F;
2757 aSig |= 0x00800000;
2758 zSign = aExp < 0;
2759 zSig = aExp << 23;
2761 for (i = 1 << 22; i > 0; i >>= 1) {
2762 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2763 if ( aSig & 0x01000000 ) {
2764 aSig >>= 1;
2765 zSig |= i;
2769 if ( zSign )
2770 zSig = -zSig;
2772 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2775 /*----------------------------------------------------------------------------
2776 | Returns 1 if the single-precision floating-point value `a' is equal to
2777 | the corresponding value `b', and 0 otherwise. The invalid exception is
2778 | raised if either operand is a NaN. Otherwise, the comparison is performed
2779 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2780 *----------------------------------------------------------------------------*/
2782 int float32_eq( float32 a, float32 b STATUS_PARAM )
2784 uint32_t av, bv;
2785 a = float32_squash_input_denormal(a STATUS_VAR);
2786 b = float32_squash_input_denormal(b STATUS_VAR);
2788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2791 float_raise( float_flag_invalid STATUS_VAR);
2792 return 0;
2794 av = float32_val(a);
2795 bv = float32_val(b);
2796 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2799 /*----------------------------------------------------------------------------
2800 | Returns 1 if the single-precision floating-point value `a' is less than
2801 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2802 | exception is raised if either operand is a NaN. The comparison is performed
2803 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2804 *----------------------------------------------------------------------------*/
2806 int float32_le( float32 a, float32 b STATUS_PARAM )
2808 flag aSign, bSign;
2809 uint32_t av, bv;
2810 a = float32_squash_input_denormal(a STATUS_VAR);
2811 b = float32_squash_input_denormal(b STATUS_VAR);
2813 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2814 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2816 float_raise( float_flag_invalid STATUS_VAR);
2817 return 0;
2819 aSign = extractFloat32Sign( a );
2820 bSign = extractFloat32Sign( b );
2821 av = float32_val(a);
2822 bv = float32_val(b);
2823 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2824 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than
2830 | the corresponding value `b', and 0 otherwise. The invalid exception is
2831 | raised if either operand is a NaN. The comparison is performed according
2832 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2835 int float32_lt( float32 a, float32 b STATUS_PARAM )
2837 flag aSign, bSign;
2838 uint32_t av, bv;
2839 a = float32_squash_input_denormal(a STATUS_VAR);
2840 b = float32_squash_input_denormal(b STATUS_VAR);
2842 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2843 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2845 float_raise( float_flag_invalid STATUS_VAR);
2846 return 0;
2848 aSign = extractFloat32Sign( a );
2849 bSign = extractFloat32Sign( b );
2850 av = float32_val(a);
2851 bv = float32_val(b);
2852 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2853 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2857 /*----------------------------------------------------------------------------
2858 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2859 | be compared, and 0 otherwise. The invalid exception is raised if either
2860 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2861 | Standard for Binary Floating-Point Arithmetic.
2862 *----------------------------------------------------------------------------*/
2864 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2866 a = float32_squash_input_denormal(a STATUS_VAR);
2867 b = float32_squash_input_denormal(b STATUS_VAR);
2869 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2870 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2872 float_raise( float_flag_invalid STATUS_VAR);
2873 return 1;
2875 return 0;
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2881 | exception. The comparison is performed according to the IEC/IEEE Standard
2882 | for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2887 a = float32_squash_input_denormal(a STATUS_VAR);
2888 b = float32_squash_input_denormal(b STATUS_VAR);
2890 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2891 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2894 float_raise( float_flag_invalid STATUS_VAR);
2896 return 0;
2898 return ( float32_val(a) == float32_val(b) ) ||
2899 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than or
2904 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2905 | cause an exception. Otherwise, the comparison is performed according to the
2906 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2909 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2911 flag aSign, bSign;
2912 uint32_t av, bv;
2913 a = float32_squash_input_denormal(a STATUS_VAR);
2914 b = float32_squash_input_denormal(b STATUS_VAR);
2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2919 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2920 float_raise( float_flag_invalid STATUS_VAR);
2922 return 0;
2924 aSign = extractFloat32Sign( a );
2925 bSign = extractFloat32Sign( b );
2926 av = float32_val(a);
2927 bv = float32_val(b);
2928 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2929 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2933 /*----------------------------------------------------------------------------
2934 | Returns 1 if the single-precision floating-point value `a' is less than
2935 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2936 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2937 | Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2940 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2942 flag aSign, bSign;
2943 uint32_t av, bv;
2944 a = float32_squash_input_denormal(a STATUS_VAR);
2945 b = float32_squash_input_denormal(b STATUS_VAR);
2947 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2950 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2951 float_raise( float_flag_invalid STATUS_VAR);
2953 return 0;
2955 aSign = extractFloat32Sign( a );
2956 bSign = extractFloat32Sign( b );
2957 av = float32_val(a);
2958 bv = float32_val(b);
2959 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2960 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2964 /*----------------------------------------------------------------------------
2965 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2966 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2967 | comparison is performed according to the IEC/IEEE Standard for Binary
2968 | Floating-Point Arithmetic.
2969 *----------------------------------------------------------------------------*/
2971 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2973 a = float32_squash_input_denormal(a STATUS_VAR);
2974 b = float32_squash_input_denormal(b STATUS_VAR);
2976 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2977 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2979 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2980 float_raise( float_flag_invalid STATUS_VAR);
2982 return 1;
2984 return 0;
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 32-bit two's complement integer format. The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic---which means in particular that the conversion is rounded
2992 | according to the current rounding mode. If `a' is a NaN, the largest
2993 | positive integer is returned. Otherwise, if the conversion overflows, the
2994 | largest integer with the same sign as `a' is returned.
2995 *----------------------------------------------------------------------------*/
2997 int32 float64_to_int32( float64 a STATUS_PARAM )
2999 flag aSign;
3000 int_fast16_t aExp, shiftCount;
3001 uint64_t aSig;
3002 a = float64_squash_input_denormal(a STATUS_VAR);
3004 aSig = extractFloat64Frac( a );
3005 aExp = extractFloat64Exp( a );
3006 aSign = extractFloat64Sign( a );
3007 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3008 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009 shiftCount = 0x42C - aExp;
3010 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3011 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of converting the double-precision floating-point value
3017 | `a' to the 32-bit two's complement integer format. The conversion is
3018 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3019 | Arithmetic, except that the conversion is always rounded toward zero.
3020 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3021 | the conversion overflows, the largest integer with the same sign as `a' is
3022 | returned.
3023 *----------------------------------------------------------------------------*/
3025 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
3027 flag aSign;
3028 int_fast16_t aExp, shiftCount;
3029 uint64_t aSig, savedASig;
3030 int32_t z;
3031 a = float64_squash_input_denormal(a STATUS_VAR);
3033 aSig = extractFloat64Frac( a );
3034 aExp = extractFloat64Exp( a );
3035 aSign = extractFloat64Sign( a );
3036 if ( 0x41E < aExp ) {
3037 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3038 goto invalid;
3040 else if ( aExp < 0x3FF ) {
3041 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3042 return 0;
3044 aSig |= LIT64( 0x0010000000000000 );
3045 shiftCount = 0x433 - aExp;
3046 savedASig = aSig;
3047 aSig >>= shiftCount;
3048 z = aSig;
3049 if ( aSign ) z = - z;
3050 if ( ( z < 0 ) ^ aSign ) {
3051 invalid:
3052 float_raise( float_flag_invalid STATUS_VAR);
3053 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3055 if ( ( aSig<<shiftCount ) != savedASig ) {
3056 STATUS(float_exception_flags) |= float_flag_inexact;
3058 return z;
3062 /*----------------------------------------------------------------------------
3063 | Returns the result of converting the double-precision floating-point value
3064 | `a' to the 16-bit two's complement integer format. The conversion is
3065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3066 | Arithmetic, except that the conversion is always rounded toward zero.
3067 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3068 | the conversion overflows, the largest integer with the same sign as `a' is
3069 | returned.
3070 *----------------------------------------------------------------------------*/
3072 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
3074 flag aSign;
3075 int_fast16_t aExp, shiftCount;
3076 uint64_t aSig, savedASig;
3077 int32 z;
3079 aSig = extractFloat64Frac( a );
3080 aExp = extractFloat64Exp( a );
3081 aSign = extractFloat64Sign( a );
3082 if ( 0x40E < aExp ) {
3083 if ( ( aExp == 0x7FF ) && aSig ) {
3084 aSign = 0;
3086 goto invalid;
3088 else if ( aExp < 0x3FF ) {
3089 if ( aExp || aSig ) {
3090 STATUS(float_exception_flags) |= float_flag_inexact;
3092 return 0;
3094 aSig |= LIT64( 0x0010000000000000 );
3095 shiftCount = 0x433 - aExp;
3096 savedASig = aSig;
3097 aSig >>= shiftCount;
3098 z = aSig;
3099 if ( aSign ) {
3100 z = - z;
3102 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3103 invalid:
3104 float_raise( float_flag_invalid STATUS_VAR);
3105 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3107 if ( ( aSig<<shiftCount ) != savedASig ) {
3108 STATUS(float_exception_flags) |= float_flag_inexact;
3110 return z;
3113 /*----------------------------------------------------------------------------
3114 | Returns the result of converting the double-precision floating-point value
3115 | `a' to the 64-bit two's complement integer format. The conversion is
3116 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3117 | Arithmetic---which means in particular that the conversion is rounded
3118 | according to the current rounding mode. If `a' is a NaN, the largest
3119 | positive integer is returned. Otherwise, if the conversion overflows, the
3120 | largest integer with the same sign as `a' is returned.
3121 *----------------------------------------------------------------------------*/
3123 int64 float64_to_int64( float64 a STATUS_PARAM )
3125 flag aSign;
3126 int_fast16_t aExp, shiftCount;
3127 uint64_t aSig, aSigExtra;
3128 a = float64_squash_input_denormal(a STATUS_VAR);
3130 aSig = extractFloat64Frac( a );
3131 aExp = extractFloat64Exp( a );
3132 aSign = extractFloat64Sign( a );
3133 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3134 shiftCount = 0x433 - aExp;
3135 if ( shiftCount <= 0 ) {
3136 if ( 0x43E < aExp ) {
3137 float_raise( float_flag_invalid STATUS_VAR);
3138 if ( ! aSign
3139 || ( ( aExp == 0x7FF )
3140 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3142 return LIT64( 0x7FFFFFFFFFFFFFFF );
3144 return (int64_t) LIT64( 0x8000000000000000 );
3146 aSigExtra = 0;
3147 aSig <<= - shiftCount;
3149 else {
3150 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3152 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the double-precision floating-point value
3158 | `a' to the 64-bit two's complement integer format. The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 | Arithmetic, except that the conversion is always rounded toward zero.
3161 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3162 | the conversion overflows, the largest integer with the same sign as `a' is
3163 | returned.
3164 *----------------------------------------------------------------------------*/
3166 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3168 flag aSign;
3169 int_fast16_t aExp, shiftCount;
3170 uint64_t aSig;
3171 int64 z;
3172 a = float64_squash_input_denormal(a STATUS_VAR);
3174 aSig = extractFloat64Frac( a );
3175 aExp = extractFloat64Exp( a );
3176 aSign = extractFloat64Sign( a );
3177 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3178 shiftCount = aExp - 0x433;
3179 if ( 0 <= shiftCount ) {
3180 if ( 0x43E <= aExp ) {
3181 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3182 float_raise( float_flag_invalid STATUS_VAR);
3183 if ( ! aSign
3184 || ( ( aExp == 0x7FF )
3185 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3187 return LIT64( 0x7FFFFFFFFFFFFFFF );
3190 return (int64_t) LIT64( 0x8000000000000000 );
3192 z = aSig<<shiftCount;
3194 else {
3195 if ( aExp < 0x3FE ) {
3196 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3197 return 0;
3199 z = aSig>>( - shiftCount );
3200 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3201 STATUS(float_exception_flags) |= float_flag_inexact;
3204 if ( aSign ) z = - z;
3205 return z;
3209 /*----------------------------------------------------------------------------
3210 | Returns the result of converting the double-precision floating-point value
3211 | `a' to the single-precision floating-point format. The conversion is
3212 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3213 | Arithmetic.
3214 *----------------------------------------------------------------------------*/
3216 float32 float64_to_float32( float64 a STATUS_PARAM )
3218 flag aSign;
3219 int_fast16_t aExp;
3220 uint64_t aSig;
3221 uint32_t zSig;
3222 a = float64_squash_input_denormal(a STATUS_VAR);
3224 aSig = extractFloat64Frac( a );
3225 aExp = extractFloat64Exp( a );
3226 aSign = extractFloat64Sign( a );
3227 if ( aExp == 0x7FF ) {
3228 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3229 return packFloat32( aSign, 0xFF, 0 );
3231 shift64RightJamming( aSig, 22, &aSig );
3232 zSig = aSig;
3233 if ( aExp || zSig ) {
3234 zSig |= 0x40000000;
3235 aExp -= 0x381;
3237 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3242 /*----------------------------------------------------------------------------
3243 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3244 | half-precision floating-point value, returning the result. After being
3245 | shifted into the proper positions, the three fields are simply added
3246 | together to form the result. This means that any integer portion of `zSig'
3247 | will be added into the exponent. Since a properly normalized significand
3248 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3249 | than the desired result exponent whenever `zSig' is a complete, normalized
3250 | significand.
3251 *----------------------------------------------------------------------------*/
3252 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3254 return make_float16(
3255 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3258 /*----------------------------------------------------------------------------
3259 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3260 | and significand `zSig', and returns the proper half-precision floating-
3261 | point value corresponding to the abstract input. Ordinarily, the abstract
3262 | value is simply rounded and packed into the half-precision format, with
3263 | the inexact exception raised if the abstract input cannot be represented
3264 | exactly. However, if the abstract value is too large, the overflow and
3265 | inexact exceptions are raised and an infinity or maximal finite value is
3266 | returned. If the abstract value is too small, the input value is rounded to
3267 | a subnormal number, and the underflow and inexact exceptions are raised if
3268 | the abstract input cannot be represented exactly as a subnormal half-
3269 | precision floating-point number.
3270 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3271 | ARM-style "alternative representation", which omits the NaN and Inf
3272 | encodings in order to raise the maximum representable exponent by one.
3273 | The input significand `zSig' has its binary point between bits 22
3274 | and 23, which is 13 bits to the left of the usual location. This shifted
3275 | significand must be normalized or smaller. If `zSig' is not normalized,
3276 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3277 | and it must not require rounding. In the usual case that `zSig' is
3278 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3279 | Note the slightly odd position of the binary point in zSig compared with the
3280 | other roundAndPackFloat functions. This should probably be fixed if we
3281 | need to implement more float16 routines than just conversion.
3282 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3283 | Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3286 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3287 uint32_t zSig, flag ieee STATUS_PARAM)
3289 int maxexp = ieee ? 29 : 30;
3290 uint32_t mask;
3291 uint32_t increment;
3292 bool rounding_bumps_exp;
3293 bool is_tiny = false;
3295 /* Calculate the mask of bits of the mantissa which are not
3296 * representable in half-precision and will be lost.
3298 if (zExp < 1) {
3299 /* Will be denormal in halfprec */
3300 mask = 0x00ffffff;
3301 if (zExp >= -11) {
3302 mask >>= 11 + zExp;
3304 } else {
3305 /* Normal number in halfprec */
3306 mask = 0x00001fff;
3309 switch (STATUS(float_rounding_mode)) {
3310 case float_round_nearest_even:
3311 increment = (mask + 1) >> 1;
3312 if ((zSig & mask) == increment) {
3313 increment = zSig & (increment << 1);
3315 break;
3316 case float_round_ties_away:
3317 increment = (mask + 1) >> 1;
3318 break;
3319 case float_round_up:
3320 increment = zSign ? 0 : mask;
3321 break;
3322 case float_round_down:
3323 increment = zSign ? mask : 0;
3324 break;
3325 default: /* round_to_zero */
3326 increment = 0;
3327 break;
3330 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3332 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3333 if (ieee) {
3334 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3335 return packFloat16(zSign, 0x1f, 0);
3336 } else {
3337 float_raise(float_flag_invalid STATUS_VAR);
3338 return packFloat16(zSign, 0x1f, 0x3ff);
3342 if (zExp < 0) {
3343 /* Note that flush-to-zero does not affect half-precision results */
3344 is_tiny =
3345 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3346 || (zExp < -1)
3347 || (!rounding_bumps_exp);
3349 if (zSig & mask) {
3350 float_raise(float_flag_inexact STATUS_VAR);
3351 if (is_tiny) {
3352 float_raise(float_flag_underflow STATUS_VAR);
3356 zSig += increment;
3357 if (rounding_bumps_exp) {
3358 zSig >>= 1;
3359 zExp++;
3362 if (zExp < -10) {
3363 return packFloat16(zSign, 0, 0);
3365 if (zExp < 0) {
3366 zSig >>= -zExp;
3367 zExp = 0;
3369 return packFloat16(zSign, zExp, zSig >> 13);
3372 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3373 uint32_t *zSigPtr)
3375 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3376 *zSigPtr = aSig << shiftCount;
3377 *zExpPtr = 1 - shiftCount;
3380 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3381 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3383 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3385 flag aSign;
3386 int_fast16_t aExp;
3387 uint32_t aSig;
3389 aSign = extractFloat16Sign(a);
3390 aExp = extractFloat16Exp(a);
3391 aSig = extractFloat16Frac(a);
3393 if (aExp == 0x1f && ieee) {
3394 if (aSig) {
3395 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3397 return packFloat32(aSign, 0xff, 0);
3399 if (aExp == 0) {
3400 if (aSig == 0) {
3401 return packFloat32(aSign, 0, 0);
3404 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3405 aExp--;
3407 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3410 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3412 flag aSign;
3413 int_fast16_t aExp;
3414 uint32_t aSig;
3416 a = float32_squash_input_denormal(a STATUS_VAR);
3418 aSig = extractFloat32Frac( a );
3419 aExp = extractFloat32Exp( a );
3420 aSign = extractFloat32Sign( a );
3421 if ( aExp == 0xFF ) {
3422 if (aSig) {
3423 /* Input is a NaN */
3424 if (!ieee) {
3425 float_raise(float_flag_invalid STATUS_VAR);
3426 return packFloat16(aSign, 0, 0);
3428 return commonNaNToFloat16(
3429 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3431 /* Infinity */
3432 if (!ieee) {
3433 float_raise(float_flag_invalid STATUS_VAR);
3434 return packFloat16(aSign, 0x1f, 0x3ff);
3436 return packFloat16(aSign, 0x1f, 0);
3438 if (aExp == 0 && aSig == 0) {
3439 return packFloat16(aSign, 0, 0);
3441 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3442 * even if the input is denormal; however this is harmless because
3443 * the largest possible single-precision denormal is still smaller
3444 * than the smallest representable half-precision denormal, and so we
3445 * will end up ignoring aSig and returning via the "always return zero"
3446 * codepath.
3448 aSig |= 0x00800000;
3449 aExp -= 0x71;
3451 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3454 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3456 flag aSign;
3457 int_fast16_t aExp;
3458 uint32_t aSig;
3460 aSign = extractFloat16Sign(a);
3461 aExp = extractFloat16Exp(a);
3462 aSig = extractFloat16Frac(a);
3464 if (aExp == 0x1f && ieee) {
3465 if (aSig) {
3466 return commonNaNToFloat64(
3467 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3469 return packFloat64(aSign, 0x7ff, 0);
3471 if (aExp == 0) {
3472 if (aSig == 0) {
3473 return packFloat64(aSign, 0, 0);
3476 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3477 aExp--;
3479 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3482 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3484 flag aSign;
3485 int_fast16_t aExp;
3486 uint64_t aSig;
3487 uint32_t zSig;
3489 a = float64_squash_input_denormal(a STATUS_VAR);
3491 aSig = extractFloat64Frac(a);
3492 aExp = extractFloat64Exp(a);
3493 aSign = extractFloat64Sign(a);
3494 if (aExp == 0x7FF) {
3495 if (aSig) {
3496 /* Input is a NaN */
3497 if (!ieee) {
3498 float_raise(float_flag_invalid STATUS_VAR);
3499 return packFloat16(aSign, 0, 0);
3501 return commonNaNToFloat16(
3502 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3504 /* Infinity */
3505 if (!ieee) {
3506 float_raise(float_flag_invalid STATUS_VAR);
3507 return packFloat16(aSign, 0x1f, 0x3ff);
3509 return packFloat16(aSign, 0x1f, 0);
3511 shift64RightJamming(aSig, 29, &aSig);
3512 zSig = aSig;
3513 if (aExp == 0 && zSig == 0) {
3514 return packFloat16(aSign, 0, 0);
3516 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3517 * even if the input is denormal; however this is harmless because
3518 * the largest possible single-precision denormal is still smaller
3519 * than the smallest representable half-precision denormal, and so we
3520 * will end up ignoring aSig and returning via the "always return zero"
3521 * codepath.
3523 zSig |= 0x00800000;
3524 aExp -= 0x3F1;
3526 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3529 /*----------------------------------------------------------------------------
3530 | Returns the result of converting the double-precision floating-point value
3531 | `a' to the extended double-precision floating-point format. The conversion
3532 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3533 | Arithmetic.
3534 *----------------------------------------------------------------------------*/
3536 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3538 flag aSign;
3539 int_fast16_t aExp;
3540 uint64_t aSig;
3542 a = float64_squash_input_denormal(a STATUS_VAR);
3543 aSig = extractFloat64Frac( a );
3544 aExp = extractFloat64Exp( a );
3545 aSign = extractFloat64Sign( a );
3546 if ( aExp == 0x7FF ) {
3547 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3548 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3550 if ( aExp == 0 ) {
3551 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3552 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3554 return
3555 packFloatx80(
3556 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3560 /*----------------------------------------------------------------------------
3561 | Returns the result of converting the double-precision floating-point value
3562 | `a' to the quadruple-precision floating-point format. The conversion is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3564 | Arithmetic.
3565 *----------------------------------------------------------------------------*/
3567 float128 float64_to_float128( float64 a STATUS_PARAM )
3569 flag aSign;
3570 int_fast16_t aExp;
3571 uint64_t aSig, zSig0, zSig1;
3573 a = float64_squash_input_denormal(a STATUS_VAR);
3574 aSig = extractFloat64Frac( a );
3575 aExp = extractFloat64Exp( a );
3576 aSign = extractFloat64Sign( a );
3577 if ( aExp == 0x7FF ) {
3578 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3579 return packFloat128( aSign, 0x7FFF, 0, 0 );
3581 if ( aExp == 0 ) {
3582 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3583 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3584 --aExp;
3586 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3587 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3591 /*----------------------------------------------------------------------------
3592 | Rounds the double-precision floating-point value `a' to an integer, and
3593 | returns the result as a double-precision floating-point value. The
3594 | operation is performed according to the IEC/IEEE Standard for Binary
3595 | Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3598 float64 float64_round_to_int( float64 a STATUS_PARAM )
3600 flag aSign;
3601 int_fast16_t aExp;
3602 uint64_t lastBitMask, roundBitsMask;
3603 uint64_t z;
3604 a = float64_squash_input_denormal(a STATUS_VAR);
3606 aExp = extractFloat64Exp( a );
3607 if ( 0x433 <= aExp ) {
3608 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3609 return propagateFloat64NaN( a, a STATUS_VAR );
3611 return a;
3613 if ( aExp < 0x3FF ) {
3614 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3615 STATUS(float_exception_flags) |= float_flag_inexact;
3616 aSign = extractFloat64Sign( a );
3617 switch ( STATUS(float_rounding_mode) ) {
3618 case float_round_nearest_even:
3619 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3620 return packFloat64( aSign, 0x3FF, 0 );
3622 break;
3623 case float_round_ties_away:
3624 if (aExp == 0x3FE) {
3625 return packFloat64(aSign, 0x3ff, 0);
3627 break;
3628 case float_round_down:
3629 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3630 case float_round_up:
3631 return make_float64(
3632 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3634 return packFloat64( aSign, 0, 0 );
3636 lastBitMask = 1;
3637 lastBitMask <<= 0x433 - aExp;
3638 roundBitsMask = lastBitMask - 1;
3639 z = float64_val(a);
3640 switch (STATUS(float_rounding_mode)) {
3641 case float_round_nearest_even:
3642 z += lastBitMask >> 1;
3643 if ((z & roundBitsMask) == 0) {
3644 z &= ~lastBitMask;
3646 break;
3647 case float_round_ties_away:
3648 z += lastBitMask >> 1;
3649 break;
3650 case float_round_to_zero:
3651 break;
3652 case float_round_up:
3653 if (!extractFloat64Sign(make_float64(z))) {
3654 z += roundBitsMask;
3656 break;
3657 case float_round_down:
3658 if (extractFloat64Sign(make_float64(z))) {
3659 z += roundBitsMask;
3661 break;
3662 default:
3663 abort();
3665 z &= ~ roundBitsMask;
3666 if ( z != float64_val(a) )
3667 STATUS(float_exception_flags) |= float_flag_inexact;
3668 return make_float64(z);
3672 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3674 int oldmode;
3675 float64 res;
3676 oldmode = STATUS(float_rounding_mode);
3677 STATUS(float_rounding_mode) = float_round_to_zero;
3678 res = float64_round_to_int(a STATUS_VAR);
3679 STATUS(float_rounding_mode) = oldmode;
3680 return res;
3683 /*----------------------------------------------------------------------------
3684 | Returns the result of adding the absolute values of the double-precision
3685 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3686 | before being returned. `zSign' is ignored if the result is a NaN.
3687 | The addition is performed according to the IEC/IEEE Standard for Binary
3688 | Floating-Point Arithmetic.
3689 *----------------------------------------------------------------------------*/
3691 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3693 int_fast16_t aExp, bExp, zExp;
3694 uint64_t aSig, bSig, zSig;
3695 int_fast16_t expDiff;
3697 aSig = extractFloat64Frac( a );
3698 aExp = extractFloat64Exp( a );
3699 bSig = extractFloat64Frac( b );
3700 bExp = extractFloat64Exp( b );
3701 expDiff = aExp - bExp;
3702 aSig <<= 9;
3703 bSig <<= 9;
3704 if ( 0 < expDiff ) {
3705 if ( aExp == 0x7FF ) {
3706 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3707 return a;
3709 if ( bExp == 0 ) {
3710 --expDiff;
3712 else {
3713 bSig |= LIT64( 0x2000000000000000 );
3715 shift64RightJamming( bSig, expDiff, &bSig );
3716 zExp = aExp;
3718 else if ( expDiff < 0 ) {
3719 if ( bExp == 0x7FF ) {
3720 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3721 return packFloat64( zSign, 0x7FF, 0 );
3723 if ( aExp == 0 ) {
3724 ++expDiff;
3726 else {
3727 aSig |= LIT64( 0x2000000000000000 );
3729 shift64RightJamming( aSig, - expDiff, &aSig );
3730 zExp = bExp;
3732 else {
3733 if ( aExp == 0x7FF ) {
3734 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3735 return a;
3737 if ( aExp == 0 ) {
3738 if (STATUS(flush_to_zero)) {
3739 if (aSig | bSig) {
3740 float_raise(float_flag_output_denormal STATUS_VAR);
3742 return packFloat64(zSign, 0, 0);
3744 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3746 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3747 zExp = aExp;
3748 goto roundAndPack;
3750 aSig |= LIT64( 0x2000000000000000 );
3751 zSig = ( aSig + bSig )<<1;
3752 --zExp;
3753 if ( (int64_t) zSig < 0 ) {
3754 zSig = aSig + bSig;
3755 ++zExp;
3757 roundAndPack:
3758 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3762 /*----------------------------------------------------------------------------
3763 | Returns the result of subtracting the absolute values of the double-
3764 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3765 | difference is negated before being returned. `zSign' is ignored if the
3766 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3767 | Standard for Binary Floating-Point Arithmetic.
3768 *----------------------------------------------------------------------------*/
3770 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3772 int_fast16_t aExp, bExp, zExp;
3773 uint64_t aSig, bSig, zSig;
3774 int_fast16_t expDiff;
3776 aSig = extractFloat64Frac( a );
3777 aExp = extractFloat64Exp( a );
3778 bSig = extractFloat64Frac( b );
3779 bExp = extractFloat64Exp( b );
3780 expDiff = aExp - bExp;
3781 aSig <<= 10;
3782 bSig <<= 10;
3783 if ( 0 < expDiff ) goto aExpBigger;
3784 if ( expDiff < 0 ) goto bExpBigger;
3785 if ( aExp == 0x7FF ) {
3786 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3787 float_raise( float_flag_invalid STATUS_VAR);
3788 return float64_default_nan;
3790 if ( aExp == 0 ) {
3791 aExp = 1;
3792 bExp = 1;
3794 if ( bSig < aSig ) goto aBigger;
3795 if ( aSig < bSig ) goto bBigger;
3796 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3797 bExpBigger:
3798 if ( bExp == 0x7FF ) {
3799 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3800 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3802 if ( aExp == 0 ) {
3803 ++expDiff;
3805 else {
3806 aSig |= LIT64( 0x4000000000000000 );
3808 shift64RightJamming( aSig, - expDiff, &aSig );
3809 bSig |= LIT64( 0x4000000000000000 );
3810 bBigger:
3811 zSig = bSig - aSig;
3812 zExp = bExp;
3813 zSign ^= 1;
3814 goto normalizeRoundAndPack;
3815 aExpBigger:
3816 if ( aExp == 0x7FF ) {
3817 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3818 return a;
3820 if ( bExp == 0 ) {
3821 --expDiff;
3823 else {
3824 bSig |= LIT64( 0x4000000000000000 );
3826 shift64RightJamming( bSig, expDiff, &bSig );
3827 aSig |= LIT64( 0x4000000000000000 );
3828 aBigger:
3829 zSig = aSig - bSig;
3830 zExp = aExp;
3831 normalizeRoundAndPack:
3832 --zExp;
3833 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3837 /*----------------------------------------------------------------------------
3838 | Returns the result of adding the double-precision floating-point values `a'
3839 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3840 | Binary Floating-Point Arithmetic.
3841 *----------------------------------------------------------------------------*/
3843 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3845 flag aSign, bSign;
3846 a = float64_squash_input_denormal(a STATUS_VAR);
3847 b = float64_squash_input_denormal(b STATUS_VAR);
3849 aSign = extractFloat64Sign( a );
3850 bSign = extractFloat64Sign( b );
3851 if ( aSign == bSign ) {
3852 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3854 else {
3855 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the double-precision floating-point values
3862 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3863 | for Binary Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3866 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3868 flag aSign, bSign;
3869 a = float64_squash_input_denormal(a STATUS_VAR);
3870 b = float64_squash_input_denormal(b STATUS_VAR);
3872 aSign = extractFloat64Sign( a );
3873 bSign = extractFloat64Sign( b );
3874 if ( aSign == bSign ) {
3875 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3877 else {
3878 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3883 /*----------------------------------------------------------------------------
3884 | Returns the result of multiplying the double-precision floating-point values
3885 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3886 | for Binary Floating-Point Arithmetic.
3887 *----------------------------------------------------------------------------*/
3889 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3891 flag aSign, bSign, zSign;
3892 int_fast16_t aExp, bExp, zExp;
3893 uint64_t aSig, bSig, zSig0, zSig1;
3895 a = float64_squash_input_denormal(a STATUS_VAR);
3896 b = float64_squash_input_denormal(b STATUS_VAR);
3898 aSig = extractFloat64Frac( a );
3899 aExp = extractFloat64Exp( a );
3900 aSign = extractFloat64Sign( a );
3901 bSig = extractFloat64Frac( b );
3902 bExp = extractFloat64Exp( b );
3903 bSign = extractFloat64Sign( b );
3904 zSign = aSign ^ bSign;
3905 if ( aExp == 0x7FF ) {
3906 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3907 return propagateFloat64NaN( a, b STATUS_VAR );
3909 if ( ( bExp | bSig ) == 0 ) {
3910 float_raise( float_flag_invalid STATUS_VAR);
3911 return float64_default_nan;
3913 return packFloat64( zSign, 0x7FF, 0 );
3915 if ( bExp == 0x7FF ) {
3916 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3917 if ( ( aExp | aSig ) == 0 ) {
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 return float64_default_nan;
3921 return packFloat64( zSign, 0x7FF, 0 );
3923 if ( aExp == 0 ) {
3924 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3925 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3927 if ( bExp == 0 ) {
3928 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3929 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3931 zExp = aExp + bExp - 0x3FF;
3932 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3933 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3934 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935 zSig0 |= ( zSig1 != 0 );
3936 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3937 zSig0 <<= 1;
3938 --zExp;
3940 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3944 /*----------------------------------------------------------------------------
3945 | Returns the result of dividing the double-precision floating-point value `a'
3946 | by the corresponding value `b'. The operation is performed according to
3947 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948 *----------------------------------------------------------------------------*/
3950 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3952 flag aSign, bSign, zSign;
3953 int_fast16_t aExp, bExp, zExp;
3954 uint64_t aSig, bSig, zSig;
3955 uint64_t rem0, rem1;
3956 uint64_t term0, term1;
3957 a = float64_squash_input_denormal(a STATUS_VAR);
3958 b = float64_squash_input_denormal(b STATUS_VAR);
3960 aSig = extractFloat64Frac( a );
3961 aExp = extractFloat64Exp( a );
3962 aSign = extractFloat64Sign( a );
3963 bSig = extractFloat64Frac( b );
3964 bExp = extractFloat64Exp( b );
3965 bSign = extractFloat64Sign( b );
3966 zSign = aSign ^ bSign;
3967 if ( aExp == 0x7FF ) {
3968 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3969 if ( bExp == 0x7FF ) {
3970 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3971 float_raise( float_flag_invalid STATUS_VAR);
3972 return float64_default_nan;
3974 return packFloat64( zSign, 0x7FF, 0 );
3976 if ( bExp == 0x7FF ) {
3977 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3978 return packFloat64( zSign, 0, 0 );
3980 if ( bExp == 0 ) {
3981 if ( bSig == 0 ) {
3982 if ( ( aExp | aSig ) == 0 ) {
3983 float_raise( float_flag_invalid STATUS_VAR);
3984 return float64_default_nan;
3986 float_raise( float_flag_divbyzero STATUS_VAR);
3987 return packFloat64( zSign, 0x7FF, 0 );
3989 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3991 if ( aExp == 0 ) {
3992 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3993 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3995 zExp = aExp - bExp + 0x3FD;
3996 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3997 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3998 if ( bSig <= ( aSig + aSig ) ) {
3999 aSig >>= 1;
4000 ++zExp;
4002 zSig = estimateDiv128To64( aSig, 0, bSig );
4003 if ( ( zSig & 0x1FF ) <= 2 ) {
4004 mul64To128( bSig, zSig, &term0, &term1 );
4005 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4006 while ( (int64_t) rem0 < 0 ) {
4007 --zSig;
4008 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4010 zSig |= ( rem1 != 0 );
4012 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
4016 /*----------------------------------------------------------------------------
4017 | Returns the remainder of the double-precision floating-point value `a'
4018 | with respect to the corresponding value `b'. The operation is performed
4019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4020 *----------------------------------------------------------------------------*/
4022 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
4024 flag aSign, zSign;
4025 int_fast16_t aExp, bExp, expDiff;
4026 uint64_t aSig, bSig;
4027 uint64_t q, alternateASig;
4028 int64_t sigMean;
4030 a = float64_squash_input_denormal(a STATUS_VAR);
4031 b = float64_squash_input_denormal(b STATUS_VAR);
4032 aSig = extractFloat64Frac( a );
4033 aExp = extractFloat64Exp( a );
4034 aSign = extractFloat64Sign( a );
4035 bSig = extractFloat64Frac( b );
4036 bExp = extractFloat64Exp( b );
4037 if ( aExp == 0x7FF ) {
4038 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4039 return propagateFloat64NaN( a, b STATUS_VAR );
4041 float_raise( float_flag_invalid STATUS_VAR);
4042 return float64_default_nan;
4044 if ( bExp == 0x7FF ) {
4045 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
4046 return a;
4048 if ( bExp == 0 ) {
4049 if ( bSig == 0 ) {
4050 float_raise( float_flag_invalid STATUS_VAR);
4051 return float64_default_nan;
4053 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4055 if ( aExp == 0 ) {
4056 if ( aSig == 0 ) return a;
4057 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4059 expDiff = aExp - bExp;
4060 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4061 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4062 if ( expDiff < 0 ) {
4063 if ( expDiff < -1 ) return a;
4064 aSig >>= 1;
4066 q = ( bSig <= aSig );
4067 if ( q ) aSig -= bSig;
4068 expDiff -= 64;
4069 while ( 0 < expDiff ) {
4070 q = estimateDiv128To64( aSig, 0, bSig );
4071 q = ( 2 < q ) ? q - 2 : 0;
4072 aSig = - ( ( bSig>>2 ) * q );
4073 expDiff -= 62;
4075 expDiff += 64;
4076 if ( 0 < expDiff ) {
4077 q = estimateDiv128To64( aSig, 0, bSig );
4078 q = ( 2 < q ) ? q - 2 : 0;
4079 q >>= 64 - expDiff;
4080 bSig >>= 2;
4081 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4083 else {
4084 aSig >>= 2;
4085 bSig >>= 2;
4087 do {
4088 alternateASig = aSig;
4089 ++q;
4090 aSig -= bSig;
4091 } while ( 0 <= (int64_t) aSig );
4092 sigMean = aSig + alternateASig;
4093 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4094 aSig = alternateASig;
4096 zSign = ( (int64_t) aSig < 0 );
4097 if ( zSign ) aSig = - aSig;
4098 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4102 /*----------------------------------------------------------------------------
4103 | Returns the result of multiplying the double-precision floating-point values
4104 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4105 | multiplication. The operation is performed according to the IEC/IEEE
4106 | Standard for Binary Floating-Point Arithmetic 754-2008.
4107 | The flags argument allows the caller to select negation of the
4108 | addend, the intermediate product, or the final result. (The difference
4109 | between this and having the caller do a separate negation is that negating
4110 | externally will flip the sign bit on NaNs.)
4111 *----------------------------------------------------------------------------*/
4113 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4115 flag aSign, bSign, cSign, zSign;
4116 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4117 uint64_t aSig, bSig, cSig;
4118 flag pInf, pZero, pSign;
4119 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4120 int shiftcount;
4121 flag signflip, infzero;
4123 a = float64_squash_input_denormal(a STATUS_VAR);
4124 b = float64_squash_input_denormal(b STATUS_VAR);
4125 c = float64_squash_input_denormal(c STATUS_VAR);
4126 aSig = extractFloat64Frac(a);
4127 aExp = extractFloat64Exp(a);
4128 aSign = extractFloat64Sign(a);
4129 bSig = extractFloat64Frac(b);
4130 bExp = extractFloat64Exp(b);
4131 bSign = extractFloat64Sign(b);
4132 cSig = extractFloat64Frac(c);
4133 cExp = extractFloat64Exp(c);
4134 cSign = extractFloat64Sign(c);
4136 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4137 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4139 /* It is implementation-defined whether the cases of (0,inf,qnan)
4140 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4141 * they return if they do), so we have to hand this information
4142 * off to the target-specific pick-a-NaN routine.
4144 if (((aExp == 0x7ff) && aSig) ||
4145 ((bExp == 0x7ff) && bSig) ||
4146 ((cExp == 0x7ff) && cSig)) {
4147 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4150 if (infzero) {
4151 float_raise(float_flag_invalid STATUS_VAR);
4152 return float64_default_nan;
4155 if (flags & float_muladd_negate_c) {
4156 cSign ^= 1;
4159 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4161 /* Work out the sign and type of the product */
4162 pSign = aSign ^ bSign;
4163 if (flags & float_muladd_negate_product) {
4164 pSign ^= 1;
4166 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4167 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4169 if (cExp == 0x7ff) {
4170 if (pInf && (pSign ^ cSign)) {
4171 /* addition of opposite-signed infinities => InvalidOperation */
4172 float_raise(float_flag_invalid STATUS_VAR);
4173 return float64_default_nan;
4175 /* Otherwise generate an infinity of the same sign */
4176 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4179 if (pInf) {
4180 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4183 if (pZero) {
4184 if (cExp == 0) {
4185 if (cSig == 0) {
4186 /* Adding two exact zeroes */
4187 if (pSign == cSign) {
4188 zSign = pSign;
4189 } else if (STATUS(float_rounding_mode) == float_round_down) {
4190 zSign = 1;
4191 } else {
4192 zSign = 0;
4194 return packFloat64(zSign ^ signflip, 0, 0);
4196 /* Exact zero plus a denorm */
4197 if (STATUS(flush_to_zero)) {
4198 float_raise(float_flag_output_denormal STATUS_VAR);
4199 return packFloat64(cSign ^ signflip, 0, 0);
4202 /* Zero plus something non-zero : just return the something */
4203 if (flags & float_muladd_halve_result) {
4204 if (cExp == 0) {
4205 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4207 /* Subtract one to halve, and one again because roundAndPackFloat64
4208 * wants one less than the true exponent.
4210 cExp -= 2;
4211 cSig = (cSig | 0x0010000000000000ULL) << 10;
4212 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4214 return packFloat64(cSign ^ signflip, cExp, cSig);
4217 if (aExp == 0) {
4218 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4220 if (bExp == 0) {
4221 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4224 /* Calculate the actual result a * b + c */
4226 /* Multiply first; this is easy. */
4227 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4228 * because we want the true exponent, not the "one-less-than"
4229 * flavour that roundAndPackFloat64() takes.
4231 pExp = aExp + bExp - 0x3fe;
4232 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4233 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4234 mul64To128(aSig, bSig, &pSig0, &pSig1);
4235 if ((int64_t)(pSig0 << 1) >= 0) {
4236 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4237 pExp--;
4240 zSign = pSign ^ signflip;
4242 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4243 * bit in position 126.
4245 if (cExp == 0) {
4246 if (!cSig) {
4247 /* Throw out the special case of c being an exact zero now */
4248 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4249 if (flags & float_muladd_halve_result) {
4250 pExp--;
4252 return roundAndPackFloat64(zSign, pExp - 1,
4253 pSig1 STATUS_VAR);
4255 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4258 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4259 * significand of the addend, with the explicit bit in position 126.
4261 cSig0 = cSig << (126 - 64 - 52);
4262 cSig1 = 0;
4263 cSig0 |= LIT64(0x4000000000000000);
4264 expDiff = pExp - cExp;
4266 if (pSign == cSign) {
4267 /* Addition */
4268 if (expDiff > 0) {
4269 /* scale c to match p */
4270 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4271 zExp = pExp;
4272 } else if (expDiff < 0) {
4273 /* scale p to match c */
4274 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4275 zExp = cExp;
4276 } else {
4277 /* no scaling needed */
4278 zExp = cExp;
4280 /* Add significands and make sure explicit bit ends up in posn 126 */
4281 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4282 if ((int64_t)zSig0 < 0) {
4283 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4284 } else {
4285 zExp--;
4287 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4288 if (flags & float_muladd_halve_result) {
4289 zExp--;
4291 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4292 } else {
4293 /* Subtraction */
4294 if (expDiff > 0) {
4295 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4296 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4297 zExp = pExp;
4298 } else if (expDiff < 0) {
4299 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4300 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4301 zExp = cExp;
4302 zSign ^= 1;
4303 } else {
4304 zExp = pExp;
4305 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4306 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4307 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4308 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4309 zSign ^= 1;
4310 } else {
4311 /* Exact zero */
4312 zSign = signflip;
4313 if (STATUS(float_rounding_mode) == float_round_down) {
4314 zSign ^= 1;
4316 return packFloat64(zSign, 0, 0);
4319 --zExp;
4320 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4321 * starting with the significand in a pair of uint64_t.
4323 if (zSig0) {
4324 shiftcount = countLeadingZeros64(zSig0) - 1;
4325 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4326 if (zSig1) {
4327 zSig0 |= 1;
4329 zExp -= shiftcount;
4330 } else {
4331 shiftcount = countLeadingZeros64(zSig1);
4332 if (shiftcount == 0) {
4333 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4334 zExp -= 63;
4335 } else {
4336 shiftcount--;
4337 zSig0 = zSig1 << shiftcount;
4338 zExp -= (shiftcount + 64);
4341 if (flags & float_muladd_halve_result) {
4342 zExp--;
4344 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4348 /*----------------------------------------------------------------------------
4349 | Returns the square root of the double-precision floating-point value `a'.
4350 | The operation is performed according to the IEC/IEEE Standard for Binary
4351 | Floating-Point Arithmetic.
4352 *----------------------------------------------------------------------------*/
4354 float64 float64_sqrt( float64 a STATUS_PARAM )
4356 flag aSign;
4357 int_fast16_t aExp, zExp;
4358 uint64_t aSig, zSig, doubleZSig;
4359 uint64_t rem0, rem1, term0, term1;
4360 a = float64_squash_input_denormal(a STATUS_VAR);
4362 aSig = extractFloat64Frac( a );
4363 aExp = extractFloat64Exp( a );
4364 aSign = extractFloat64Sign( a );
4365 if ( aExp == 0x7FF ) {
4366 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4367 if ( ! aSign ) return a;
4368 float_raise( float_flag_invalid STATUS_VAR);
4369 return float64_default_nan;
4371 if ( aSign ) {
4372 if ( ( aExp | aSig ) == 0 ) return a;
4373 float_raise( float_flag_invalid STATUS_VAR);
4374 return float64_default_nan;
4376 if ( aExp == 0 ) {
4377 if ( aSig == 0 ) return float64_zero;
4378 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4380 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4381 aSig |= LIT64( 0x0010000000000000 );
4382 zSig = estimateSqrt32( aExp, aSig>>21 );
4383 aSig <<= 9 - ( aExp & 1 );
4384 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4385 if ( ( zSig & 0x1FF ) <= 5 ) {
4386 doubleZSig = zSig<<1;
4387 mul64To128( zSig, zSig, &term0, &term1 );
4388 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4389 while ( (int64_t) rem0 < 0 ) {
4390 --zSig;
4391 doubleZSig -= 2;
4392 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4394 zSig |= ( ( rem0 | rem1 ) != 0 );
4396 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4400 /*----------------------------------------------------------------------------
4401 | Returns the binary log of the double-precision floating-point value `a'.
4402 | The operation is performed according to the IEC/IEEE Standard for Binary
4403 | Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 float64 float64_log2( float64 a STATUS_PARAM )
4407 flag aSign, zSign;
4408 int_fast16_t aExp;
4409 uint64_t aSig, aSig0, aSig1, zSig, i;
4410 a = float64_squash_input_denormal(a STATUS_VAR);
4412 aSig = extractFloat64Frac( a );
4413 aExp = extractFloat64Exp( a );
4414 aSign = extractFloat64Sign( a );
4416 if ( aExp == 0 ) {
4417 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4418 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4420 if ( aSign ) {
4421 float_raise( float_flag_invalid STATUS_VAR);
4422 return float64_default_nan;
4424 if ( aExp == 0x7FF ) {
4425 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4426 return a;
4429 aExp -= 0x3FF;
4430 aSig |= LIT64( 0x0010000000000000 );
4431 zSign = aExp < 0;
4432 zSig = (uint64_t)aExp << 52;
4433 for (i = 1LL << 51; i > 0; i >>= 1) {
4434 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4435 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4436 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4437 aSig >>= 1;
4438 zSig |= i;
4442 if ( zSign )
4443 zSig = -zSig;
4444 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point value `a' is equal to the
4449 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4450 | if either operand is a NaN. Otherwise, the comparison is performed
4451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4454 int float64_eq( float64 a, float64 b STATUS_PARAM )
4456 uint64_t av, bv;
4457 a = float64_squash_input_denormal(a STATUS_VAR);
4458 b = float64_squash_input_denormal(b STATUS_VAR);
4460 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4461 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4463 float_raise( float_flag_invalid STATUS_VAR);
4464 return 0;
4466 av = float64_val(a);
4467 bv = float64_val(b);
4468 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4472 /*----------------------------------------------------------------------------
4473 | Returns 1 if the double-precision floating-point value `a' is less than or
4474 | equal to the corresponding value `b', and 0 otherwise. The invalid
4475 | exception is raised if either operand is a NaN. The comparison is performed
4476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4477 *----------------------------------------------------------------------------*/
4479 int float64_le( float64 a, float64 b STATUS_PARAM )
4481 flag aSign, bSign;
4482 uint64_t av, bv;
4483 a = float64_squash_input_denormal(a STATUS_VAR);
4484 b = float64_squash_input_denormal(b STATUS_VAR);
4486 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4487 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4489 float_raise( float_flag_invalid STATUS_VAR);
4490 return 0;
4492 aSign = extractFloat64Sign( a );
4493 bSign = extractFloat64Sign( b );
4494 av = float64_val(a);
4495 bv = float64_val(b);
4496 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4497 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4501 /*----------------------------------------------------------------------------
4502 | Returns 1 if the double-precision floating-point value `a' is less than
4503 | the corresponding value `b', and 0 otherwise. The invalid exception is
4504 | raised if either operand is a NaN. The comparison is performed according
4505 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4506 *----------------------------------------------------------------------------*/
4508 int float64_lt( float64 a, float64 b STATUS_PARAM )
4510 flag aSign, bSign;
4511 uint64_t av, bv;
4513 a = float64_squash_input_denormal(a STATUS_VAR);
4514 b = float64_squash_input_denormal(b STATUS_VAR);
4515 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4516 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4518 float_raise( float_flag_invalid STATUS_VAR);
4519 return 0;
4521 aSign = extractFloat64Sign( a );
4522 bSign = extractFloat64Sign( b );
4523 av = float64_val(a);
4524 bv = float64_val(b);
4525 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4526 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4532 | be compared, and 0 otherwise. The invalid exception is raised if either
4533 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4534 | Standard for Binary Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4539 a = float64_squash_input_denormal(a STATUS_VAR);
4540 b = float64_squash_input_denormal(b STATUS_VAR);
4542 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4543 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4545 float_raise( float_flag_invalid STATUS_VAR);
4546 return 1;
4548 return 0;
4551 /*----------------------------------------------------------------------------
4552 | Returns 1 if the double-precision floating-point value `a' is equal to the
4553 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4554 | exception.The comparison is performed according to the IEC/IEEE Standard
4555 | for Binary Floating-Point Arithmetic.
4556 *----------------------------------------------------------------------------*/
4558 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4560 uint64_t av, bv;
4561 a = float64_squash_input_denormal(a STATUS_VAR);
4562 b = float64_squash_input_denormal(b STATUS_VAR);
4564 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4565 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4567 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4568 float_raise( float_flag_invalid STATUS_VAR);
4570 return 0;
4572 av = float64_val(a);
4573 bv = float64_val(b);
4574 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4578 /*----------------------------------------------------------------------------
4579 | Returns 1 if the double-precision floating-point value `a' is less than or
4580 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4581 | cause an exception. Otherwise, the comparison is performed according to the
4582 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4583 *----------------------------------------------------------------------------*/
4585 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4587 flag aSign, bSign;
4588 uint64_t av, bv;
4589 a = float64_squash_input_denormal(a STATUS_VAR);
4590 b = float64_squash_input_denormal(b STATUS_VAR);
4592 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4593 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4595 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4596 float_raise( float_flag_invalid STATUS_VAR);
4598 return 0;
4600 aSign = extractFloat64Sign( a );
4601 bSign = extractFloat64Sign( b );
4602 av = float64_val(a);
4603 bv = float64_val(b);
4604 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4605 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4609 /*----------------------------------------------------------------------------
4610 | Returns 1 if the double-precision floating-point value `a' is less than
4611 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4612 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4613 | Standard for Binary Floating-Point Arithmetic.
4614 *----------------------------------------------------------------------------*/
4616 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4618 flag aSign, bSign;
4619 uint64_t av, bv;
4620 a = float64_squash_input_denormal(a STATUS_VAR);
4621 b = float64_squash_input_denormal(b STATUS_VAR);
4623 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4624 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4626 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4627 float_raise( float_flag_invalid STATUS_VAR);
4629 return 0;
4631 aSign = extractFloat64Sign( a );
4632 bSign = extractFloat64Sign( b );
4633 av = float64_val(a);
4634 bv = float64_val(b);
4635 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4636 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4640 /*----------------------------------------------------------------------------
4641 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4642 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4643 | comparison is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic.
4645 *----------------------------------------------------------------------------*/
4647 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4649 a = float64_squash_input_denormal(a STATUS_VAR);
4650 b = float64_squash_input_denormal(b STATUS_VAR);
4652 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4653 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4655 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4656 float_raise( float_flag_invalid STATUS_VAR);
4658 return 1;
4660 return 0;
4663 /*----------------------------------------------------------------------------
4664 | Returns the result of converting the extended double-precision floating-
4665 | point value `a' to the 32-bit two's complement integer format. The
4666 | conversion is performed according to the IEC/IEEE Standard for Binary
4667 | Floating-Point Arithmetic---which means in particular that the conversion
4668 | is rounded according to the current rounding mode. If `a' is a NaN, the
4669 | largest positive integer is returned. Otherwise, if the conversion
4670 | overflows, the largest integer with the same sign as `a' is returned.
4671 *----------------------------------------------------------------------------*/
4673 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4675 flag aSign;
4676 int32 aExp, shiftCount;
4677 uint64_t aSig;
4679 aSig = extractFloatx80Frac( a );
4680 aExp = extractFloatx80Exp( a );
4681 aSign = extractFloatx80Sign( a );
4682 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4683 shiftCount = 0x4037 - aExp;
4684 if ( shiftCount <= 0 ) shiftCount = 1;
4685 shift64RightJamming( aSig, shiftCount, &aSig );
4686 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the 32-bit two's complement integer format. The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic, except that the conversion is always rounded
4695 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4696 | Otherwise, if the conversion overflows, the largest integer with the same
4697 | sign as `a' is returned.
4698 *----------------------------------------------------------------------------*/
4700 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4702 flag aSign;
4703 int32 aExp, shiftCount;
4704 uint64_t aSig, savedASig;
4705 int32_t z;
4707 aSig = extractFloatx80Frac( a );
4708 aExp = extractFloatx80Exp( a );
4709 aSign = extractFloatx80Sign( a );
4710 if ( 0x401E < aExp ) {
4711 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4712 goto invalid;
4714 else if ( aExp < 0x3FFF ) {
4715 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4716 return 0;
4718 shiftCount = 0x403E - aExp;
4719 savedASig = aSig;
4720 aSig >>= shiftCount;
4721 z = aSig;
4722 if ( aSign ) z = - z;
4723 if ( ( z < 0 ) ^ aSign ) {
4724 invalid:
4725 float_raise( float_flag_invalid STATUS_VAR);
4726 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4728 if ( ( aSig<<shiftCount ) != savedASig ) {
4729 STATUS(float_exception_flags) |= float_flag_inexact;
4731 return z;
4735 /*----------------------------------------------------------------------------
4736 | Returns the result of converting the extended double-precision floating-
4737 | point value `a' to the 64-bit two's complement integer format. The
4738 | conversion is performed according to the IEC/IEEE Standard for Binary
4739 | Floating-Point Arithmetic---which means in particular that the conversion
4740 | is rounded according to the current rounding mode. If `a' is a NaN,
4741 | the largest positive integer is returned. Otherwise, if the conversion
4742 | overflows, the largest integer with the same sign as `a' is returned.
4743 *----------------------------------------------------------------------------*/
4745 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4747 flag aSign;
4748 int32 aExp, shiftCount;
4749 uint64_t aSig, aSigExtra;
4751 aSig = extractFloatx80Frac( a );
4752 aExp = extractFloatx80Exp( a );
4753 aSign = extractFloatx80Sign( a );
4754 shiftCount = 0x403E - aExp;
4755 if ( shiftCount <= 0 ) {
4756 if ( shiftCount ) {
4757 float_raise( float_flag_invalid STATUS_VAR);
4758 if ( ! aSign
4759 || ( ( aExp == 0x7FFF )
4760 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4762 return LIT64( 0x7FFFFFFFFFFFFFFF );
4764 return (int64_t) LIT64( 0x8000000000000000 );
4766 aSigExtra = 0;
4768 else {
4769 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4771 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4775 /*----------------------------------------------------------------------------
4776 | Returns the result of converting the extended double-precision floating-
4777 | point value `a' to the 64-bit two's complement integer format. The
4778 | conversion is performed according to the IEC/IEEE Standard for Binary
4779 | Floating-Point Arithmetic, except that the conversion is always rounded
4780 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4781 | Otherwise, if the conversion overflows, the largest integer with the same
4782 | sign as `a' is returned.
4783 *----------------------------------------------------------------------------*/
4785 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4787 flag aSign;
4788 int32 aExp, shiftCount;
4789 uint64_t aSig;
4790 int64 z;
4792 aSig = extractFloatx80Frac( a );
4793 aExp = extractFloatx80Exp( a );
4794 aSign = extractFloatx80Sign( a );
4795 shiftCount = aExp - 0x403E;
4796 if ( 0 <= shiftCount ) {
4797 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4798 if ( ( a.high != 0xC03E ) || aSig ) {
4799 float_raise( float_flag_invalid STATUS_VAR);
4800 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4801 return LIT64( 0x7FFFFFFFFFFFFFFF );
4804 return (int64_t) LIT64( 0x8000000000000000 );
4806 else if ( aExp < 0x3FFF ) {
4807 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4808 return 0;
4810 z = aSig>>( - shiftCount );
4811 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4812 STATUS(float_exception_flags) |= float_flag_inexact;
4814 if ( aSign ) z = - z;
4815 return z;
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the extended double-precision floating-
4821 | point value `a' to the single-precision floating-point format. The
4822 | conversion is performed according to the IEC/IEEE Standard for Binary
4823 | Floating-Point Arithmetic.
4824 *----------------------------------------------------------------------------*/
4826 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4828 flag aSign;
4829 int32 aExp;
4830 uint64_t aSig;
4832 aSig = extractFloatx80Frac( a );
4833 aExp = extractFloatx80Exp( a );
4834 aSign = extractFloatx80Sign( a );
4835 if ( aExp == 0x7FFF ) {
4836 if ( (uint64_t) ( aSig<<1 ) ) {
4837 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4839 return packFloat32( aSign, 0xFF, 0 );
4841 shift64RightJamming( aSig, 33, &aSig );
4842 if ( aExp || aSig ) aExp -= 0x3F81;
4843 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4847 /*----------------------------------------------------------------------------
4848 | Returns the result of converting the extended double-precision floating-
4849 | point value `a' to the double-precision floating-point format. The
4850 | conversion is performed according to the IEC/IEEE Standard for Binary
4851 | Floating-Point Arithmetic.
4852 *----------------------------------------------------------------------------*/
4854 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4856 flag aSign;
4857 int32 aExp;
4858 uint64_t aSig, zSig;
4860 aSig = extractFloatx80Frac( a );
4861 aExp = extractFloatx80Exp( a );
4862 aSign = extractFloatx80Sign( a );
4863 if ( aExp == 0x7FFF ) {
4864 if ( (uint64_t) ( aSig<<1 ) ) {
4865 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4867 return packFloat64( aSign, 0x7FF, 0 );
4869 shift64RightJamming( aSig, 1, &zSig );
4870 if ( aExp || aSig ) aExp -= 0x3C01;
4871 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4875 /*----------------------------------------------------------------------------
4876 | Returns the result of converting the extended double-precision floating-
4877 | point value `a' to the quadruple-precision floating-point format. The
4878 | conversion is performed according to the IEC/IEEE Standard for Binary
4879 | Floating-Point Arithmetic.
4880 *----------------------------------------------------------------------------*/
4882 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4884 flag aSign;
4885 int_fast16_t aExp;
4886 uint64_t aSig, zSig0, zSig1;
4888 aSig = extractFloatx80Frac( a );
4889 aExp = extractFloatx80Exp( a );
4890 aSign = extractFloatx80Sign( a );
4891 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4892 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4894 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4895 return packFloat128( aSign, aExp, zSig0, zSig1 );
4899 /*----------------------------------------------------------------------------
4900 | Rounds the extended double-precision floating-point value `a' to an integer,
4901 | and returns the result as an extended quadruple-precision floating-point
4902 | value. The operation is performed according to the IEC/IEEE Standard for
4903 | Binary Floating-Point Arithmetic.
4904 *----------------------------------------------------------------------------*/
4906 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4908 flag aSign;
4909 int32 aExp;
4910 uint64_t lastBitMask, roundBitsMask;
4911 floatx80 z;
4913 aExp = extractFloatx80Exp( a );
4914 if ( 0x403E <= aExp ) {
4915 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4916 return propagateFloatx80NaN( a, a STATUS_VAR );
4918 return a;
4920 if ( aExp < 0x3FFF ) {
4921 if ( ( aExp == 0 )
4922 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4923 return a;
4925 STATUS(float_exception_flags) |= float_flag_inexact;
4926 aSign = extractFloatx80Sign( a );
4927 switch ( STATUS(float_rounding_mode) ) {
4928 case float_round_nearest_even:
4929 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4931 return
4932 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4934 break;
4935 case float_round_ties_away:
4936 if (aExp == 0x3FFE) {
4937 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4939 break;
4940 case float_round_down:
4941 return
4942 aSign ?
4943 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4944 : packFloatx80( 0, 0, 0 );
4945 case float_round_up:
4946 return
4947 aSign ? packFloatx80( 1, 0, 0 )
4948 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4950 return packFloatx80( aSign, 0, 0 );
4952 lastBitMask = 1;
4953 lastBitMask <<= 0x403E - aExp;
4954 roundBitsMask = lastBitMask - 1;
4955 z = a;
4956 switch (STATUS(float_rounding_mode)) {
4957 case float_round_nearest_even:
4958 z.low += lastBitMask>>1;
4959 if ((z.low & roundBitsMask) == 0) {
4960 z.low &= ~lastBitMask;
4962 break;
4963 case float_round_ties_away:
4964 z.low += lastBitMask >> 1;
4965 break;
4966 case float_round_to_zero:
4967 break;
4968 case float_round_up:
4969 if (!extractFloatx80Sign(z)) {
4970 z.low += roundBitsMask;
4972 break;
4973 case float_round_down:
4974 if (extractFloatx80Sign(z)) {
4975 z.low += roundBitsMask;
4977 break;
4978 default:
4979 abort();
4981 z.low &= ~ roundBitsMask;
4982 if ( z.low == 0 ) {
4983 ++z.high;
4984 z.low = LIT64( 0x8000000000000000 );
4986 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4987 return z;
4991 /*----------------------------------------------------------------------------
4992 | Returns the result of adding the absolute values of the extended double-
4993 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4994 | negated before being returned. `zSign' is ignored if the result is a NaN.
4995 | The addition is performed according to the IEC/IEEE Standard for Binary
4996 | Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4999 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
5001 int32 aExp, bExp, zExp;
5002 uint64_t aSig, bSig, zSig0, zSig1;
5003 int32 expDiff;
5005 aSig = extractFloatx80Frac( a );
5006 aExp = extractFloatx80Exp( a );
5007 bSig = extractFloatx80Frac( b );
5008 bExp = extractFloatx80Exp( b );
5009 expDiff = aExp - bExp;
5010 if ( 0 < expDiff ) {
5011 if ( aExp == 0x7FFF ) {
5012 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5013 return a;
5015 if ( bExp == 0 ) --expDiff;
5016 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5017 zExp = aExp;
5019 else if ( expDiff < 0 ) {
5020 if ( bExp == 0x7FFF ) {
5021 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5022 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5024 if ( aExp == 0 ) ++expDiff;
5025 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5026 zExp = bExp;
5028 else {
5029 if ( aExp == 0x7FFF ) {
5030 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5031 return propagateFloatx80NaN( a, b STATUS_VAR );
5033 return a;
5035 zSig1 = 0;
5036 zSig0 = aSig + bSig;
5037 if ( aExp == 0 ) {
5038 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5039 goto roundAndPack;
5041 zExp = aExp;
5042 goto shiftRight1;
5044 zSig0 = aSig + bSig;
5045 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5046 shiftRight1:
5047 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5048 zSig0 |= LIT64( 0x8000000000000000 );
5049 ++zExp;
5050 roundAndPack:
5051 return
5052 roundAndPackFloatx80(
5053 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5057 /*----------------------------------------------------------------------------
5058 | Returns the result of subtracting the absolute values of the extended
5059 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5060 | difference is negated before being returned. `zSign' is ignored if the
5061 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5062 | Standard for Binary Floating-Point Arithmetic.
5063 *----------------------------------------------------------------------------*/
5065 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
5067 int32 aExp, bExp, zExp;
5068 uint64_t aSig, bSig, zSig0, zSig1;
5069 int32 expDiff;
5070 floatx80 z;
5072 aSig = extractFloatx80Frac( a );
5073 aExp = extractFloatx80Exp( a );
5074 bSig = extractFloatx80Frac( b );
5075 bExp = extractFloatx80Exp( b );
5076 expDiff = aExp - bExp;
5077 if ( 0 < expDiff ) goto aExpBigger;
5078 if ( expDiff < 0 ) goto bExpBigger;
5079 if ( aExp == 0x7FFF ) {
5080 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5081 return propagateFloatx80NaN( a, b STATUS_VAR );
5083 float_raise( float_flag_invalid STATUS_VAR);
5084 z.low = floatx80_default_nan_low;
5085 z.high = floatx80_default_nan_high;
5086 return z;
5088 if ( aExp == 0 ) {
5089 aExp = 1;
5090 bExp = 1;
5092 zSig1 = 0;
5093 if ( bSig < aSig ) goto aBigger;
5094 if ( aSig < bSig ) goto bBigger;
5095 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5096 bExpBigger:
5097 if ( bExp == 0x7FFF ) {
5098 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5099 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5101 if ( aExp == 0 ) ++expDiff;
5102 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5103 bBigger:
5104 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5105 zExp = bExp;
5106 zSign ^= 1;
5107 goto normalizeRoundAndPack;
5108 aExpBigger:
5109 if ( aExp == 0x7FFF ) {
5110 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5111 return a;
5113 if ( bExp == 0 ) --expDiff;
5114 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5115 aBigger:
5116 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5117 zExp = aExp;
5118 normalizeRoundAndPack:
5119 return
5120 normalizeRoundAndPackFloatx80(
5121 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the extended double-precision floating-point
5127 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5128 | Standard for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5131 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5133 flag aSign, bSign;
5135 aSign = extractFloatx80Sign( a );
5136 bSign = extractFloatx80Sign( b );
5137 if ( aSign == bSign ) {
5138 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5140 else {
5141 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5146 /*----------------------------------------------------------------------------
5147 | Returns the result of subtracting the extended double-precision floating-
5148 | point values `a' and `b'. The operation is performed according to the
5149 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5152 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5154 flag aSign, bSign;
5156 aSign = extractFloatx80Sign( a );
5157 bSign = extractFloatx80Sign( b );
5158 if ( aSign == bSign ) {
5159 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5161 else {
5162 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of multiplying the extended double-precision floating-
5169 | point values `a' and `b'. The operation is performed according to the
5170 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5175 flag aSign, bSign, zSign;
5176 int32 aExp, bExp, zExp;
5177 uint64_t aSig, bSig, zSig0, zSig1;
5178 floatx80 z;
5180 aSig = extractFloatx80Frac( a );
5181 aExp = extractFloatx80Exp( a );
5182 aSign = extractFloatx80Sign( a );
5183 bSig = extractFloatx80Frac( b );
5184 bExp = extractFloatx80Exp( b );
5185 bSign = extractFloatx80Sign( b );
5186 zSign = aSign ^ bSign;
5187 if ( aExp == 0x7FFF ) {
5188 if ( (uint64_t) ( aSig<<1 )
5189 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5190 return propagateFloatx80NaN( a, b STATUS_VAR );
5192 if ( ( bExp | bSig ) == 0 ) goto invalid;
5193 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5195 if ( bExp == 0x7FFF ) {
5196 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5197 if ( ( aExp | aSig ) == 0 ) {
5198 invalid:
5199 float_raise( float_flag_invalid STATUS_VAR);
5200 z.low = floatx80_default_nan_low;
5201 z.high = floatx80_default_nan_high;
5202 return z;
5204 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5206 if ( aExp == 0 ) {
5207 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5208 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5210 if ( bExp == 0 ) {
5211 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5214 zExp = aExp + bExp - 0x3FFE;
5215 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5216 if ( 0 < (int64_t) zSig0 ) {
5217 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5218 --zExp;
5220 return
5221 roundAndPackFloatx80(
5222 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5226 /*----------------------------------------------------------------------------
5227 | Returns the result of dividing the extended double-precision floating-point
5228 | value `a' by the corresponding value `b'. The operation is performed
5229 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5230 *----------------------------------------------------------------------------*/
5232 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5234 flag aSign, bSign, zSign;
5235 int32 aExp, bExp, zExp;
5236 uint64_t aSig, bSig, zSig0, zSig1;
5237 uint64_t rem0, rem1, rem2, term0, term1, term2;
5238 floatx80 z;
5240 aSig = extractFloatx80Frac( a );
5241 aExp = extractFloatx80Exp( a );
5242 aSign = extractFloatx80Sign( a );
5243 bSig = extractFloatx80Frac( b );
5244 bExp = extractFloatx80Exp( b );
5245 bSign = extractFloatx80Sign( b );
5246 zSign = aSign ^ bSign;
5247 if ( aExp == 0x7FFF ) {
5248 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5249 if ( bExp == 0x7FFF ) {
5250 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5251 goto invalid;
5253 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5255 if ( bExp == 0x7FFF ) {
5256 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5257 return packFloatx80( zSign, 0, 0 );
5259 if ( bExp == 0 ) {
5260 if ( bSig == 0 ) {
5261 if ( ( aExp | aSig ) == 0 ) {
5262 invalid:
5263 float_raise( float_flag_invalid STATUS_VAR);
5264 z.low = floatx80_default_nan_low;
5265 z.high = floatx80_default_nan_high;
5266 return z;
5268 float_raise( float_flag_divbyzero STATUS_VAR);
5269 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5271 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5273 if ( aExp == 0 ) {
5274 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5275 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5277 zExp = aExp - bExp + 0x3FFE;
5278 rem1 = 0;
5279 if ( bSig <= aSig ) {
5280 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5281 ++zExp;
5283 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5284 mul64To128( bSig, zSig0, &term0, &term1 );
5285 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5286 while ( (int64_t) rem0 < 0 ) {
5287 --zSig0;
5288 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5290 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5291 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5292 mul64To128( bSig, zSig1, &term1, &term2 );
5293 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5294 while ( (int64_t) rem1 < 0 ) {
5295 --zSig1;
5296 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5298 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5300 return
5301 roundAndPackFloatx80(
5302 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5306 /*----------------------------------------------------------------------------
5307 | Returns the remainder of the extended double-precision floating-point value
5308 | `a' with respect to the corresponding value `b'. The operation is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5312 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5314 flag aSign, zSign;
5315 int32 aExp, bExp, expDiff;
5316 uint64_t aSig0, aSig1, bSig;
5317 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5318 floatx80 z;
5320 aSig0 = extractFloatx80Frac( a );
5321 aExp = extractFloatx80Exp( a );
5322 aSign = extractFloatx80Sign( a );
5323 bSig = extractFloatx80Frac( b );
5324 bExp = extractFloatx80Exp( b );
5325 if ( aExp == 0x7FFF ) {
5326 if ( (uint64_t) ( aSig0<<1 )
5327 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5328 return propagateFloatx80NaN( a, b STATUS_VAR );
5330 goto invalid;
5332 if ( bExp == 0x7FFF ) {
5333 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5334 return a;
5336 if ( bExp == 0 ) {
5337 if ( bSig == 0 ) {
5338 invalid:
5339 float_raise( float_flag_invalid STATUS_VAR);
5340 z.low = floatx80_default_nan_low;
5341 z.high = floatx80_default_nan_high;
5342 return z;
5344 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5346 if ( aExp == 0 ) {
5347 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5348 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5350 bSig |= LIT64( 0x8000000000000000 );
5351 zSign = aSign;
5352 expDiff = aExp - bExp;
5353 aSig1 = 0;
5354 if ( expDiff < 0 ) {
5355 if ( expDiff < -1 ) return a;
5356 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5357 expDiff = 0;
5359 q = ( bSig <= aSig0 );
5360 if ( q ) aSig0 -= bSig;
5361 expDiff -= 64;
5362 while ( 0 < expDiff ) {
5363 q = estimateDiv128To64( aSig0, aSig1, bSig );
5364 q = ( 2 < q ) ? q - 2 : 0;
5365 mul64To128( bSig, q, &term0, &term1 );
5366 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5367 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5368 expDiff -= 62;
5370 expDiff += 64;
5371 if ( 0 < expDiff ) {
5372 q = estimateDiv128To64( aSig0, aSig1, bSig );
5373 q = ( 2 < q ) ? q - 2 : 0;
5374 q >>= 64 - expDiff;
5375 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5376 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5377 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5378 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5379 ++q;
5380 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5383 else {
5384 term1 = 0;
5385 term0 = bSig;
5387 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5388 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5389 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5390 && ( q & 1 ) )
5392 aSig0 = alternateASig0;
5393 aSig1 = alternateASig1;
5394 zSign = ! zSign;
5396 return
5397 normalizeRoundAndPackFloatx80(
5398 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5402 /*----------------------------------------------------------------------------
5403 | Returns the square root of the extended double-precision floating-point
5404 | value `a'. The operation is performed according to the IEC/IEEE Standard
5405 | for Binary Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5408 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5410 flag aSign;
5411 int32 aExp, zExp;
5412 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5413 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5414 floatx80 z;
5416 aSig0 = extractFloatx80Frac( a );
5417 aExp = extractFloatx80Exp( a );
5418 aSign = extractFloatx80Sign( a );
5419 if ( aExp == 0x7FFF ) {
5420 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5421 if ( ! aSign ) return a;
5422 goto invalid;
5424 if ( aSign ) {
5425 if ( ( aExp | aSig0 ) == 0 ) return a;
5426 invalid:
5427 float_raise( float_flag_invalid STATUS_VAR);
5428 z.low = floatx80_default_nan_low;
5429 z.high = floatx80_default_nan_high;
5430 return z;
5432 if ( aExp == 0 ) {
5433 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5434 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5436 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5437 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5438 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5439 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5440 doubleZSig0 = zSig0<<1;
5441 mul64To128( zSig0, zSig0, &term0, &term1 );
5442 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5443 while ( (int64_t) rem0 < 0 ) {
5444 --zSig0;
5445 doubleZSig0 -= 2;
5446 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5448 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5449 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5450 if ( zSig1 == 0 ) zSig1 = 1;
5451 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5452 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5453 mul64To128( zSig1, zSig1, &term2, &term3 );
5454 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5455 while ( (int64_t) rem1 < 0 ) {
5456 --zSig1;
5457 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5458 term3 |= 1;
5459 term2 |= doubleZSig0;
5460 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5462 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5464 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5465 zSig0 |= doubleZSig0;
5466 return
5467 roundAndPackFloatx80(
5468 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5472 /*----------------------------------------------------------------------------
5473 | Returns 1 if the extended double-precision floating-point value `a' is equal
5474 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5475 | raised if either operand is a NaN. Otherwise, the comparison is performed
5476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5477 *----------------------------------------------------------------------------*/
5479 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5482 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5483 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5484 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5485 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5487 float_raise( float_flag_invalid STATUS_VAR);
5488 return 0;
5490 return
5491 ( a.low == b.low )
5492 && ( ( a.high == b.high )
5493 || ( ( a.low == 0 )
5494 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5499 /*----------------------------------------------------------------------------
5500 | Returns 1 if the extended double-precision floating-point value `a' is
5501 | less than or equal to the corresponding value `b', and 0 otherwise. The
5502 | invalid exception is raised if either operand is a NaN. The comparison is
5503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5504 | Arithmetic.
5505 *----------------------------------------------------------------------------*/
5507 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5509 flag aSign, bSign;
5511 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5512 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5513 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5514 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5516 float_raise( float_flag_invalid STATUS_VAR);
5517 return 0;
5519 aSign = extractFloatx80Sign( a );
5520 bSign = extractFloatx80Sign( b );
5521 if ( aSign != bSign ) {
5522 return
5523 aSign
5524 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5525 == 0 );
5527 return
5528 aSign ? le128( b.high, b.low, a.high, a.low )
5529 : le128( a.high, a.low, b.high, b.low );
5533 /*----------------------------------------------------------------------------
5534 | Returns 1 if the extended double-precision floating-point value `a' is
5535 | less than the corresponding value `b', and 0 otherwise. The invalid
5536 | exception is raised if either operand is a NaN. The comparison is performed
5537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538 *----------------------------------------------------------------------------*/
5540 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5542 flag aSign, bSign;
5544 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5545 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5546 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5547 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5549 float_raise( float_flag_invalid STATUS_VAR);
5550 return 0;
5552 aSign = extractFloatx80Sign( a );
5553 bSign = extractFloatx80Sign( b );
5554 if ( aSign != bSign ) {
5555 return
5556 aSign
5557 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5558 != 0 );
5560 return
5561 aSign ? lt128( b.high, b.low, a.high, a.low )
5562 : lt128( a.high, a.low, b.high, b.low );
5566 /*----------------------------------------------------------------------------
5567 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5568 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5569 | either operand is a NaN. The comparison is performed according to the
5570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5571 *----------------------------------------------------------------------------*/
5572 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5574 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5575 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5576 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5577 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5579 float_raise( float_flag_invalid STATUS_VAR);
5580 return 1;
5582 return 0;
5585 /*----------------------------------------------------------------------------
5586 | Returns 1 if the extended double-precision floating-point value `a' is
5587 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5588 | cause an exception. The comparison is performed according to the IEC/IEEE
5589 | Standard for Binary Floating-Point Arithmetic.
5590 *----------------------------------------------------------------------------*/
5592 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5595 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5596 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5597 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5598 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5600 if ( floatx80_is_signaling_nan( a )
5601 || floatx80_is_signaling_nan( b ) ) {
5602 float_raise( float_flag_invalid STATUS_VAR);
5604 return 0;
5606 return
5607 ( a.low == b.low )
5608 && ( ( a.high == b.high )
5609 || ( ( a.low == 0 )
5610 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5615 /*----------------------------------------------------------------------------
5616 | Returns 1 if the extended double-precision floating-point value `a' is less
5617 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5618 | do not cause an exception. Otherwise, the comparison is performed according
5619 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5622 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5624 flag aSign, bSign;
5626 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5627 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5628 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5629 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5631 if ( floatx80_is_signaling_nan( a )
5632 || floatx80_is_signaling_nan( b ) ) {
5633 float_raise( float_flag_invalid STATUS_VAR);
5635 return 0;
5637 aSign = extractFloatx80Sign( a );
5638 bSign = extractFloatx80Sign( b );
5639 if ( aSign != bSign ) {
5640 return
5641 aSign
5642 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5643 == 0 );
5645 return
5646 aSign ? le128( b.high, b.low, a.high, a.low )
5647 : le128( a.high, a.low, b.high, b.low );
5651 /*----------------------------------------------------------------------------
5652 | Returns 1 if the extended double-precision floating-point value `a' is less
5653 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5654 | an exception. Otherwise, the comparison is performed according to the
5655 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5656 *----------------------------------------------------------------------------*/
5658 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5660 flag aSign, bSign;
5662 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5663 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5664 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5665 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5667 if ( floatx80_is_signaling_nan( a )
5668 || floatx80_is_signaling_nan( b ) ) {
5669 float_raise( float_flag_invalid STATUS_VAR);
5671 return 0;
5673 aSign = extractFloatx80Sign( a );
5674 bSign = extractFloatx80Sign( b );
5675 if ( aSign != bSign ) {
5676 return
5677 aSign
5678 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5679 != 0 );
5681 return
5682 aSign ? lt128( b.high, b.low, a.high, a.low )
5683 : lt128( a.high, a.low, b.high, b.low );
5687 /*----------------------------------------------------------------------------
5688 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5689 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5690 | The comparison is performed according to the IEC/IEEE Standard for Binary
5691 | Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5693 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5700 if ( floatx80_is_signaling_nan( a )
5701 || floatx80_is_signaling_nan( b ) ) {
5702 float_raise( float_flag_invalid STATUS_VAR);
5704 return 1;
5706 return 0;
5709 /*----------------------------------------------------------------------------
5710 | Returns the result of converting the quadruple-precision floating-point
5711 | value `a' to the 32-bit two's complement integer format. The conversion
5712 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5713 | Arithmetic---which means in particular that the conversion is rounded
5714 | according to the current rounding mode. If `a' is a NaN, the largest
5715 | positive integer is returned. Otherwise, if the conversion overflows, the
5716 | largest integer with the same sign as `a' is returned.
5717 *----------------------------------------------------------------------------*/
5719 int32 float128_to_int32( float128 a STATUS_PARAM )
5721 flag aSign;
5722 int32 aExp, shiftCount;
5723 uint64_t aSig0, aSig1;
5725 aSig1 = extractFloat128Frac1( a );
5726 aSig0 = extractFloat128Frac0( a );
5727 aExp = extractFloat128Exp( a );
5728 aSign = extractFloat128Sign( a );
5729 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5730 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5731 aSig0 |= ( aSig1 != 0 );
5732 shiftCount = 0x4028 - aExp;
5733 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5734 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 32-bit two's complement integer format. The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero. If
5743 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5744 | conversion overflows, the largest integer with the same sign as `a' is
5745 | returned.
5746 *----------------------------------------------------------------------------*/
5748 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5750 flag aSign;
5751 int32 aExp, shiftCount;
5752 uint64_t aSig0, aSig1, savedASig;
5753 int32_t z;
5755 aSig1 = extractFloat128Frac1( a );
5756 aSig0 = extractFloat128Frac0( a );
5757 aExp = extractFloat128Exp( a );
5758 aSign = extractFloat128Sign( a );
5759 aSig0 |= ( aSig1 != 0 );
5760 if ( 0x401E < aExp ) {
5761 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5762 goto invalid;
5764 else if ( aExp < 0x3FFF ) {
5765 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5766 return 0;
5768 aSig0 |= LIT64( 0x0001000000000000 );
5769 shiftCount = 0x402F - aExp;
5770 savedASig = aSig0;
5771 aSig0 >>= shiftCount;
5772 z = aSig0;
5773 if ( aSign ) z = - z;
5774 if ( ( z < 0 ) ^ aSign ) {
5775 invalid:
5776 float_raise( float_flag_invalid STATUS_VAR);
5777 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5779 if ( ( aSig0<<shiftCount ) != savedASig ) {
5780 STATUS(float_exception_flags) |= float_flag_inexact;
5782 return z;
5786 /*----------------------------------------------------------------------------
5787 | Returns the result of converting the quadruple-precision floating-point
5788 | value `a' to the 64-bit two's complement integer format. The conversion
5789 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5790 | Arithmetic---which means in particular that the conversion is rounded
5791 | according to the current rounding mode. If `a' is a NaN, the largest
5792 | positive integer is returned. Otherwise, if the conversion overflows, the
5793 | largest integer with the same sign as `a' is returned.
5794 *----------------------------------------------------------------------------*/
5796 int64 float128_to_int64( float128 a STATUS_PARAM )
5798 flag aSign;
5799 int32 aExp, shiftCount;
5800 uint64_t aSig0, aSig1;
5802 aSig1 = extractFloat128Frac1( a );
5803 aSig0 = extractFloat128Frac0( a );
5804 aExp = extractFloat128Exp( a );
5805 aSign = extractFloat128Sign( a );
5806 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5807 shiftCount = 0x402F - aExp;
5808 if ( shiftCount <= 0 ) {
5809 if ( 0x403E < aExp ) {
5810 float_raise( float_flag_invalid STATUS_VAR);
5811 if ( ! aSign
5812 || ( ( aExp == 0x7FFF )
5813 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5816 return LIT64( 0x7FFFFFFFFFFFFFFF );
5818 return (int64_t) LIT64( 0x8000000000000000 );
5820 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5822 else {
5823 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5825 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5829 /*----------------------------------------------------------------------------
5830 | Returns the result of converting the quadruple-precision floating-point
5831 | value `a' to the 64-bit two's complement integer format. The conversion
5832 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833 | Arithmetic, except that the conversion is always rounded toward zero.
5834 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5835 | the conversion overflows, the largest integer with the same sign as `a' is
5836 | returned.
5837 *----------------------------------------------------------------------------*/
5839 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5841 flag aSign;
5842 int32 aExp, shiftCount;
5843 uint64_t aSig0, aSig1;
5844 int64 z;
5846 aSig1 = extractFloat128Frac1( a );
5847 aSig0 = extractFloat128Frac0( a );
5848 aExp = extractFloat128Exp( a );
5849 aSign = extractFloat128Sign( a );
5850 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5851 shiftCount = aExp - 0x402F;
5852 if ( 0 < shiftCount ) {
5853 if ( 0x403E <= aExp ) {
5854 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5855 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5856 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5857 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5859 else {
5860 float_raise( float_flag_invalid STATUS_VAR);
5861 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5862 return LIT64( 0x7FFFFFFFFFFFFFFF );
5865 return (int64_t) LIT64( 0x8000000000000000 );
5867 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5868 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5869 STATUS(float_exception_flags) |= float_flag_inexact;
5872 else {
5873 if ( aExp < 0x3FFF ) {
5874 if ( aExp | aSig0 | aSig1 ) {
5875 STATUS(float_exception_flags) |= float_flag_inexact;
5877 return 0;
5879 z = aSig0>>( - shiftCount );
5880 if ( aSig1
5881 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5882 STATUS(float_exception_flags) |= float_flag_inexact;
5885 if ( aSign ) z = - z;
5886 return z;
5890 /*----------------------------------------------------------------------------
5891 | Returns the result of converting the quadruple-precision floating-point
5892 | value `a' to the single-precision floating-point format. The conversion
5893 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5894 | Arithmetic.
5895 *----------------------------------------------------------------------------*/
5897 float32 float128_to_float32( float128 a STATUS_PARAM )
5899 flag aSign;
5900 int32 aExp;
5901 uint64_t aSig0, aSig1;
5902 uint32_t zSig;
5904 aSig1 = extractFloat128Frac1( a );
5905 aSig0 = extractFloat128Frac0( a );
5906 aExp = extractFloat128Exp( a );
5907 aSign = extractFloat128Sign( a );
5908 if ( aExp == 0x7FFF ) {
5909 if ( aSig0 | aSig1 ) {
5910 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5912 return packFloat32( aSign, 0xFF, 0 );
5914 aSig0 |= ( aSig1 != 0 );
5915 shift64RightJamming( aSig0, 18, &aSig0 );
5916 zSig = aSig0;
5917 if ( aExp || zSig ) {
5918 zSig |= 0x40000000;
5919 aExp -= 0x3F81;
5921 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5925 /*----------------------------------------------------------------------------
5926 | Returns the result of converting the quadruple-precision floating-point
5927 | value `a' to the double-precision floating-point format. The conversion
5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929 | Arithmetic.
5930 *----------------------------------------------------------------------------*/
5932 float64 float128_to_float64( float128 a STATUS_PARAM )
5934 flag aSign;
5935 int32 aExp;
5936 uint64_t aSig0, aSig1;
5938 aSig1 = extractFloat128Frac1( a );
5939 aSig0 = extractFloat128Frac0( a );
5940 aExp = extractFloat128Exp( a );
5941 aSign = extractFloat128Sign( a );
5942 if ( aExp == 0x7FFF ) {
5943 if ( aSig0 | aSig1 ) {
5944 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5946 return packFloat64( aSign, 0x7FF, 0 );
5948 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5949 aSig0 |= ( aSig1 != 0 );
5950 if ( aExp || aSig0 ) {
5951 aSig0 |= LIT64( 0x4000000000000000 );
5952 aExp -= 0x3C01;
5954 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5958 /*----------------------------------------------------------------------------
5959 | Returns the result of converting the quadruple-precision floating-point
5960 | value `a' to the extended double-precision floating-point format. The
5961 | conversion is performed according to the IEC/IEEE Standard for Binary
5962 | Floating-Point Arithmetic.
5963 *----------------------------------------------------------------------------*/
5965 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5967 flag aSign;
5968 int32 aExp;
5969 uint64_t aSig0, aSig1;
5971 aSig1 = extractFloat128Frac1( a );
5972 aSig0 = extractFloat128Frac0( a );
5973 aExp = extractFloat128Exp( a );
5974 aSign = extractFloat128Sign( a );
5975 if ( aExp == 0x7FFF ) {
5976 if ( aSig0 | aSig1 ) {
5977 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5979 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5981 if ( aExp == 0 ) {
5982 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5983 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5985 else {
5986 aSig0 |= LIT64( 0x0001000000000000 );
5988 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5989 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5993 /*----------------------------------------------------------------------------
5994 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5995 | returns the result as a quadruple-precision floating-point value. The
5996 | operation is performed according to the IEC/IEEE Standard for Binary
5997 | Floating-Point Arithmetic.
5998 *----------------------------------------------------------------------------*/
6000 float128 float128_round_to_int( float128 a STATUS_PARAM )
6002 flag aSign;
6003 int32 aExp;
6004 uint64_t lastBitMask, roundBitsMask;
6005 float128 z;
6007 aExp = extractFloat128Exp( a );
6008 if ( 0x402F <= aExp ) {
6009 if ( 0x406F <= aExp ) {
6010 if ( ( aExp == 0x7FFF )
6011 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6013 return propagateFloat128NaN( a, a STATUS_VAR );
6015 return a;
6017 lastBitMask = 1;
6018 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6019 roundBitsMask = lastBitMask - 1;
6020 z = a;
6021 switch (STATUS(float_rounding_mode)) {
6022 case float_round_nearest_even:
6023 if ( lastBitMask ) {
6024 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6025 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6027 else {
6028 if ( (int64_t) z.low < 0 ) {
6029 ++z.high;
6030 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6033 break;
6034 case float_round_ties_away:
6035 if (lastBitMask) {
6036 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6037 } else {
6038 if ((int64_t) z.low < 0) {
6039 ++z.high;
6042 break;
6043 case float_round_to_zero:
6044 break;
6045 case float_round_up:
6046 if (!extractFloat128Sign(z)) {
6047 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6049 break;
6050 case float_round_down:
6051 if (extractFloat128Sign(z)) {
6052 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6054 break;
6055 default:
6056 abort();
6058 z.low &= ~ roundBitsMask;
6060 else {
6061 if ( aExp < 0x3FFF ) {
6062 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6063 STATUS(float_exception_flags) |= float_flag_inexact;
6064 aSign = extractFloat128Sign( a );
6065 switch ( STATUS(float_rounding_mode) ) {
6066 case float_round_nearest_even:
6067 if ( ( aExp == 0x3FFE )
6068 && ( extractFloat128Frac0( a )
6069 | extractFloat128Frac1( a ) )
6071 return packFloat128( aSign, 0x3FFF, 0, 0 );
6073 break;
6074 case float_round_ties_away:
6075 if (aExp == 0x3FFE) {
6076 return packFloat128(aSign, 0x3FFF, 0, 0);
6078 break;
6079 case float_round_down:
6080 return
6081 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6082 : packFloat128( 0, 0, 0, 0 );
6083 case float_round_up:
6084 return
6085 aSign ? packFloat128( 1, 0, 0, 0 )
6086 : packFloat128( 0, 0x3FFF, 0, 0 );
6088 return packFloat128( aSign, 0, 0, 0 );
6090 lastBitMask = 1;
6091 lastBitMask <<= 0x402F - aExp;
6092 roundBitsMask = lastBitMask - 1;
6093 z.low = 0;
6094 z.high = a.high;
6095 switch (STATUS(float_rounding_mode)) {
6096 case float_round_nearest_even:
6097 z.high += lastBitMask>>1;
6098 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6099 z.high &= ~ lastBitMask;
6101 break;
6102 case float_round_ties_away:
6103 z.high += lastBitMask>>1;
6104 break;
6105 case float_round_to_zero:
6106 break;
6107 case float_round_up:
6108 if (!extractFloat128Sign(z)) {
6109 z.high |= ( a.low != 0 );
6110 z.high += roundBitsMask;
6112 break;
6113 case float_round_down:
6114 if (extractFloat128Sign(z)) {
6115 z.high |= (a.low != 0);
6116 z.high += roundBitsMask;
6118 break;
6119 default:
6120 abort();
6122 z.high &= ~ roundBitsMask;
6124 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6125 STATUS(float_exception_flags) |= float_flag_inexact;
6127 return z;
6131 /*----------------------------------------------------------------------------
6132 | Returns the result of adding the absolute values of the quadruple-precision
6133 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6134 | before being returned. `zSign' is ignored if the result is a NaN.
6135 | The addition is performed according to the IEC/IEEE Standard for Binary
6136 | Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6139 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6141 int32 aExp, bExp, zExp;
6142 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6143 int32 expDiff;
6145 aSig1 = extractFloat128Frac1( a );
6146 aSig0 = extractFloat128Frac0( a );
6147 aExp = extractFloat128Exp( a );
6148 bSig1 = extractFloat128Frac1( b );
6149 bSig0 = extractFloat128Frac0( b );
6150 bExp = extractFloat128Exp( b );
6151 expDiff = aExp - bExp;
6152 if ( 0 < expDiff ) {
6153 if ( aExp == 0x7FFF ) {
6154 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6155 return a;
6157 if ( bExp == 0 ) {
6158 --expDiff;
6160 else {
6161 bSig0 |= LIT64( 0x0001000000000000 );
6163 shift128ExtraRightJamming(
6164 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6165 zExp = aExp;
6167 else if ( expDiff < 0 ) {
6168 if ( bExp == 0x7FFF ) {
6169 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6170 return packFloat128( zSign, 0x7FFF, 0, 0 );
6172 if ( aExp == 0 ) {
6173 ++expDiff;
6175 else {
6176 aSig0 |= LIT64( 0x0001000000000000 );
6178 shift128ExtraRightJamming(
6179 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6180 zExp = bExp;
6182 else {
6183 if ( aExp == 0x7FFF ) {
6184 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6185 return propagateFloat128NaN( a, b STATUS_VAR );
6187 return a;
6189 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6190 if ( aExp == 0 ) {
6191 if (STATUS(flush_to_zero)) {
6192 if (zSig0 | zSig1) {
6193 float_raise(float_flag_output_denormal STATUS_VAR);
6195 return packFloat128(zSign, 0, 0, 0);
6197 return packFloat128( zSign, 0, zSig0, zSig1 );
6199 zSig2 = 0;
6200 zSig0 |= LIT64( 0x0002000000000000 );
6201 zExp = aExp;
6202 goto shiftRight1;
6204 aSig0 |= LIT64( 0x0001000000000000 );
6205 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6206 --zExp;
6207 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6208 ++zExp;
6209 shiftRight1:
6210 shift128ExtraRightJamming(
6211 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6212 roundAndPack:
6213 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6217 /*----------------------------------------------------------------------------
6218 | Returns the result of subtracting the absolute values of the quadruple-
6219 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6220 | difference is negated before being returned. `zSign' is ignored if the
6221 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6222 | Standard for Binary Floating-Point Arithmetic.
6223 *----------------------------------------------------------------------------*/
6225 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6227 int32 aExp, bExp, zExp;
6228 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6229 int32 expDiff;
6230 float128 z;
6232 aSig1 = extractFloat128Frac1( a );
6233 aSig0 = extractFloat128Frac0( a );
6234 aExp = extractFloat128Exp( a );
6235 bSig1 = extractFloat128Frac1( b );
6236 bSig0 = extractFloat128Frac0( b );
6237 bExp = extractFloat128Exp( b );
6238 expDiff = aExp - bExp;
6239 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6240 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6241 if ( 0 < expDiff ) goto aExpBigger;
6242 if ( expDiff < 0 ) goto bExpBigger;
6243 if ( aExp == 0x7FFF ) {
6244 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6245 return propagateFloat128NaN( a, b STATUS_VAR );
6247 float_raise( float_flag_invalid STATUS_VAR);
6248 z.low = float128_default_nan_low;
6249 z.high = float128_default_nan_high;
6250 return z;
6252 if ( aExp == 0 ) {
6253 aExp = 1;
6254 bExp = 1;
6256 if ( bSig0 < aSig0 ) goto aBigger;
6257 if ( aSig0 < bSig0 ) goto bBigger;
6258 if ( bSig1 < aSig1 ) goto aBigger;
6259 if ( aSig1 < bSig1 ) goto bBigger;
6260 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6261 bExpBigger:
6262 if ( bExp == 0x7FFF ) {
6263 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6264 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6266 if ( aExp == 0 ) {
6267 ++expDiff;
6269 else {
6270 aSig0 |= LIT64( 0x4000000000000000 );
6272 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6273 bSig0 |= LIT64( 0x4000000000000000 );
6274 bBigger:
6275 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6276 zExp = bExp;
6277 zSign ^= 1;
6278 goto normalizeRoundAndPack;
6279 aExpBigger:
6280 if ( aExp == 0x7FFF ) {
6281 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6282 return a;
6284 if ( bExp == 0 ) {
6285 --expDiff;
6287 else {
6288 bSig0 |= LIT64( 0x4000000000000000 );
6290 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6291 aSig0 |= LIT64( 0x4000000000000000 );
6292 aBigger:
6293 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6294 zExp = aExp;
6295 normalizeRoundAndPack:
6296 --zExp;
6297 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of adding the quadruple-precision floating-point values
6303 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6304 | for Binary Floating-Point Arithmetic.
6305 *----------------------------------------------------------------------------*/
6307 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6309 flag aSign, bSign;
6311 aSign = extractFloat128Sign( a );
6312 bSign = extractFloat128Sign( b );
6313 if ( aSign == bSign ) {
6314 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6316 else {
6317 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6322 /*----------------------------------------------------------------------------
6323 | Returns the result of subtracting the quadruple-precision floating-point
6324 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6325 | Standard for Binary Floating-Point Arithmetic.
6326 *----------------------------------------------------------------------------*/
6328 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6330 flag aSign, bSign;
6332 aSign = extractFloat128Sign( a );
6333 bSign = extractFloat128Sign( b );
6334 if ( aSign == bSign ) {
6335 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6337 else {
6338 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6343 /*----------------------------------------------------------------------------
6344 | Returns the result of multiplying the quadruple-precision floating-point
6345 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6346 | Standard for Binary Floating-Point Arithmetic.
6347 *----------------------------------------------------------------------------*/
6349 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6351 flag aSign, bSign, zSign;
6352 int32 aExp, bExp, zExp;
6353 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6354 float128 z;
6356 aSig1 = extractFloat128Frac1( a );
6357 aSig0 = extractFloat128Frac0( a );
6358 aExp = extractFloat128Exp( a );
6359 aSign = extractFloat128Sign( a );
6360 bSig1 = extractFloat128Frac1( b );
6361 bSig0 = extractFloat128Frac0( b );
6362 bExp = extractFloat128Exp( b );
6363 bSign = extractFloat128Sign( b );
6364 zSign = aSign ^ bSign;
6365 if ( aExp == 0x7FFF ) {
6366 if ( ( aSig0 | aSig1 )
6367 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6368 return propagateFloat128NaN( a, b STATUS_VAR );
6370 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6371 return packFloat128( zSign, 0x7FFF, 0, 0 );
6373 if ( bExp == 0x7FFF ) {
6374 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6375 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6376 invalid:
6377 float_raise( float_flag_invalid STATUS_VAR);
6378 z.low = float128_default_nan_low;
6379 z.high = float128_default_nan_high;
6380 return z;
6382 return packFloat128( zSign, 0x7FFF, 0, 0 );
6384 if ( aExp == 0 ) {
6385 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6386 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6388 if ( bExp == 0 ) {
6389 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6390 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6392 zExp = aExp + bExp - 0x4000;
6393 aSig0 |= LIT64( 0x0001000000000000 );
6394 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6395 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6396 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6397 zSig2 |= ( zSig3 != 0 );
6398 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6399 shift128ExtraRightJamming(
6400 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6401 ++zExp;
6403 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6407 /*----------------------------------------------------------------------------
6408 | Returns the result of dividing the quadruple-precision floating-point value
6409 | `a' by the corresponding value `b'. The operation is performed according to
6410 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6413 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6415 flag aSign, bSign, zSign;
6416 int32 aExp, bExp, zExp;
6417 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6418 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6419 float128 z;
6421 aSig1 = extractFloat128Frac1( a );
6422 aSig0 = extractFloat128Frac0( a );
6423 aExp = extractFloat128Exp( a );
6424 aSign = extractFloat128Sign( a );
6425 bSig1 = extractFloat128Frac1( b );
6426 bSig0 = extractFloat128Frac0( b );
6427 bExp = extractFloat128Exp( b );
6428 bSign = extractFloat128Sign( b );
6429 zSign = aSign ^ bSign;
6430 if ( aExp == 0x7FFF ) {
6431 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6432 if ( bExp == 0x7FFF ) {
6433 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6434 goto invalid;
6436 return packFloat128( zSign, 0x7FFF, 0, 0 );
6438 if ( bExp == 0x7FFF ) {
6439 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6440 return packFloat128( zSign, 0, 0, 0 );
6442 if ( bExp == 0 ) {
6443 if ( ( bSig0 | bSig1 ) == 0 ) {
6444 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6445 invalid:
6446 float_raise( float_flag_invalid STATUS_VAR);
6447 z.low = float128_default_nan_low;
6448 z.high = float128_default_nan_high;
6449 return z;
6451 float_raise( float_flag_divbyzero STATUS_VAR);
6452 return packFloat128( zSign, 0x7FFF, 0, 0 );
6454 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6456 if ( aExp == 0 ) {
6457 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6458 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6460 zExp = aExp - bExp + 0x3FFD;
6461 shortShift128Left(
6462 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6463 shortShift128Left(
6464 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6465 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6466 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6467 ++zExp;
6469 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6470 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6471 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6472 while ( (int64_t) rem0 < 0 ) {
6473 --zSig0;
6474 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6476 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6477 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6478 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6479 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6480 while ( (int64_t) rem1 < 0 ) {
6481 --zSig1;
6482 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6484 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6486 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6487 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6491 /*----------------------------------------------------------------------------
6492 | Returns the remainder of the quadruple-precision floating-point value `a'
6493 | with respect to the corresponding value `b'. The operation is performed
6494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6495 *----------------------------------------------------------------------------*/
6497 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6499 flag aSign, zSign;
6500 int32 aExp, bExp, expDiff;
6501 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6502 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6503 int64_t sigMean0;
6504 float128 z;
6506 aSig1 = extractFloat128Frac1( a );
6507 aSig0 = extractFloat128Frac0( a );
6508 aExp = extractFloat128Exp( a );
6509 aSign = extractFloat128Sign( a );
6510 bSig1 = extractFloat128Frac1( b );
6511 bSig0 = extractFloat128Frac0( b );
6512 bExp = extractFloat128Exp( b );
6513 if ( aExp == 0x7FFF ) {
6514 if ( ( aSig0 | aSig1 )
6515 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6516 return propagateFloat128NaN( a, b STATUS_VAR );
6518 goto invalid;
6520 if ( bExp == 0x7FFF ) {
6521 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6522 return a;
6524 if ( bExp == 0 ) {
6525 if ( ( bSig0 | bSig1 ) == 0 ) {
6526 invalid:
6527 float_raise( float_flag_invalid STATUS_VAR);
6528 z.low = float128_default_nan_low;
6529 z.high = float128_default_nan_high;
6530 return z;
6532 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6534 if ( aExp == 0 ) {
6535 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6536 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6538 expDiff = aExp - bExp;
6539 if ( expDiff < -1 ) return a;
6540 shortShift128Left(
6541 aSig0 | LIT64( 0x0001000000000000 ),
6542 aSig1,
6543 15 - ( expDiff < 0 ),
6544 &aSig0,
6545 &aSig1
6547 shortShift128Left(
6548 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6549 q = le128( bSig0, bSig1, aSig0, aSig1 );
6550 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6551 expDiff -= 64;
6552 while ( 0 < expDiff ) {
6553 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6554 q = ( 4 < q ) ? q - 4 : 0;
6555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6556 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6557 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6558 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6559 expDiff -= 61;
6561 if ( -64 < expDiff ) {
6562 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563 q = ( 4 < q ) ? q - 4 : 0;
6564 q >>= - expDiff;
6565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6566 expDiff += 52;
6567 if ( expDiff < 0 ) {
6568 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6570 else {
6571 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6573 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6574 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6576 else {
6577 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6578 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6580 do {
6581 alternateASig0 = aSig0;
6582 alternateASig1 = aSig1;
6583 ++q;
6584 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6585 } while ( 0 <= (int64_t) aSig0 );
6586 add128(
6587 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6588 if ( ( sigMean0 < 0 )
6589 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6590 aSig0 = alternateASig0;
6591 aSig1 = alternateASig1;
6593 zSign = ( (int64_t) aSig0 < 0 );
6594 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6595 return
6596 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6600 /*----------------------------------------------------------------------------
6601 | Returns the square root of the quadruple-precision floating-point value `a'.
6602 | The operation is performed according to the IEC/IEEE Standard for Binary
6603 | Floating-Point Arithmetic.
6604 *----------------------------------------------------------------------------*/
6606 float128 float128_sqrt( float128 a STATUS_PARAM )
6608 flag aSign;
6609 int32 aExp, zExp;
6610 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6612 float128 z;
6614 aSig1 = extractFloat128Frac1( a );
6615 aSig0 = extractFloat128Frac0( a );
6616 aExp = extractFloat128Exp( a );
6617 aSign = extractFloat128Sign( a );
6618 if ( aExp == 0x7FFF ) {
6619 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6620 if ( ! aSign ) return a;
6621 goto invalid;
6623 if ( aSign ) {
6624 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6625 invalid:
6626 float_raise( float_flag_invalid STATUS_VAR);
6627 z.low = float128_default_nan_low;
6628 z.high = float128_default_nan_high;
6629 return z;
6631 if ( aExp == 0 ) {
6632 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6633 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6635 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6636 aSig0 |= LIT64( 0x0001000000000000 );
6637 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6638 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6639 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6640 doubleZSig0 = zSig0<<1;
6641 mul64To128( zSig0, zSig0, &term0, &term1 );
6642 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6643 while ( (int64_t) rem0 < 0 ) {
6644 --zSig0;
6645 doubleZSig0 -= 2;
6646 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6648 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6649 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6650 if ( zSig1 == 0 ) zSig1 = 1;
6651 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6652 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6653 mul64To128( zSig1, zSig1, &term2, &term3 );
6654 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6655 while ( (int64_t) rem1 < 0 ) {
6656 --zSig1;
6657 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6658 term3 |= 1;
6659 term2 |= doubleZSig0;
6660 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6662 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6664 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6665 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6669 /*----------------------------------------------------------------------------
6670 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6671 | the corresponding value `b', and 0 otherwise. The invalid exception is
6672 | raised if either operand is a NaN. Otherwise, the comparison is performed
6673 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6674 *----------------------------------------------------------------------------*/
6676 int float128_eq( float128 a, float128 b STATUS_PARAM )
6679 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6680 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6681 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6682 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6684 float_raise( float_flag_invalid STATUS_VAR);
6685 return 0;
6687 return
6688 ( a.low == b.low )
6689 && ( ( a.high == b.high )
6690 || ( ( a.low == 0 )
6691 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6696 /*----------------------------------------------------------------------------
6697 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6698 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6699 | exception is raised if either operand is a NaN. The comparison is performed
6700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6701 *----------------------------------------------------------------------------*/
6703 int float128_le( float128 a, float128 b STATUS_PARAM )
6705 flag aSign, bSign;
6707 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6708 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6709 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6710 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6712 float_raise( float_flag_invalid STATUS_VAR);
6713 return 0;
6715 aSign = extractFloat128Sign( a );
6716 bSign = extractFloat128Sign( b );
6717 if ( aSign != bSign ) {
6718 return
6719 aSign
6720 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6721 == 0 );
6723 return
6724 aSign ? le128( b.high, b.low, a.high, a.low )
6725 : le128( a.high, a.low, b.high, b.low );
6729 /*----------------------------------------------------------------------------
6730 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6731 | the corresponding value `b', and 0 otherwise. The invalid exception is
6732 | raised if either operand is a NaN. The comparison is performed according
6733 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6734 *----------------------------------------------------------------------------*/
6736 int float128_lt( float128 a, float128 b STATUS_PARAM )
6738 flag aSign, bSign;
6740 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6741 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6742 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6743 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6745 float_raise( float_flag_invalid STATUS_VAR);
6746 return 0;
6748 aSign = extractFloat128Sign( a );
6749 bSign = extractFloat128Sign( b );
6750 if ( aSign != bSign ) {
6751 return
6752 aSign
6753 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6754 != 0 );
6756 return
6757 aSign ? lt128( b.high, b.low, a.high, a.low )
6758 : lt128( a.high, a.low, b.high, b.low );
6762 /*----------------------------------------------------------------------------
6763 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6764 | be compared, and 0 otherwise. The invalid exception is raised if either
6765 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6766 | Standard for Binary Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6769 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6771 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6772 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6773 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6774 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6776 float_raise( float_flag_invalid STATUS_VAR);
6777 return 1;
6779 return 0;
6782 /*----------------------------------------------------------------------------
6783 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6784 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6785 | exception. The comparison is performed according to the IEC/IEEE Standard
6786 | for Binary Floating-Point Arithmetic.
6787 *----------------------------------------------------------------------------*/
6789 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6792 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6793 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6794 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6795 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6797 if ( float128_is_signaling_nan( a )
6798 || float128_is_signaling_nan( b ) ) {
6799 float_raise( float_flag_invalid STATUS_VAR);
6801 return 0;
6803 return
6804 ( a.low == b.low )
6805 && ( ( a.high == b.high )
6806 || ( ( a.low == 0 )
6807 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6812 /*----------------------------------------------------------------------------
6813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6814 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6815 | cause an exception. Otherwise, the comparison is performed according to the
6816 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6817 *----------------------------------------------------------------------------*/
6819 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6821 flag aSign, bSign;
6823 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6824 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6825 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6826 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6828 if ( float128_is_signaling_nan( a )
6829 || float128_is_signaling_nan( b ) ) {
6830 float_raise( float_flag_invalid STATUS_VAR);
6832 return 0;
6834 aSign = extractFloat128Sign( a );
6835 bSign = extractFloat128Sign( b );
6836 if ( aSign != bSign ) {
6837 return
6838 aSign
6839 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6840 == 0 );
6842 return
6843 aSign ? le128( b.high, b.low, a.high, a.low )
6844 : le128( a.high, a.low, b.high, b.low );
6848 /*----------------------------------------------------------------------------
6849 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6850 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6851 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6852 | Standard for Binary Floating-Point Arithmetic.
6853 *----------------------------------------------------------------------------*/
6855 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6857 flag aSign, bSign;
6859 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6860 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6861 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6862 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6864 if ( float128_is_signaling_nan( a )
6865 || float128_is_signaling_nan( b ) ) {
6866 float_raise( float_flag_invalid STATUS_VAR);
6868 return 0;
6870 aSign = extractFloat128Sign( a );
6871 bSign = extractFloat128Sign( b );
6872 if ( aSign != bSign ) {
6873 return
6874 aSign
6875 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6876 != 0 );
6878 return
6879 aSign ? lt128( b.high, b.low, a.high, a.low )
6880 : lt128( a.high, a.low, b.high, b.low );
6884 /*----------------------------------------------------------------------------
6885 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6886 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6887 | comparison is performed according to the IEC/IEEE Standard for Binary
6888 | Floating-Point Arithmetic.
6889 *----------------------------------------------------------------------------*/
6891 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6893 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6894 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6895 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6896 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6898 if ( float128_is_signaling_nan( a )
6899 || float128_is_signaling_nan( b ) ) {
6900 float_raise( float_flag_invalid STATUS_VAR);
6902 return 1;
6904 return 0;
6907 /* misc functions */
6908 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6910 return int64_to_float32(a STATUS_VAR);
6913 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6915 return int64_to_float64(a STATUS_VAR);
6918 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6920 int64_t v;
6921 uint32 res;
6922 int old_exc_flags = get_float_exception_flags(status);
6924 v = float32_to_int64(a STATUS_VAR);
6925 if (v < 0) {
6926 res = 0;
6927 } else if (v > 0xffffffff) {
6928 res = 0xffffffff;
6929 } else {
6930 return v;
6932 set_float_exception_flags(old_exc_flags, status);
6933 float_raise(float_flag_invalid STATUS_VAR);
6934 return res;
6937 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6939 int64_t v;
6940 uint32 res;
6941 int old_exc_flags = get_float_exception_flags(status);
6943 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6944 if (v < 0) {
6945 res = 0;
6946 } else if (v > 0xffffffff) {
6947 res = 0xffffffff;
6948 } else {
6949 return v;
6951 set_float_exception_flags(old_exc_flags, status);
6952 float_raise(float_flag_invalid STATUS_VAR);
6953 return res;
6956 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6958 int32_t v;
6959 int_fast16_t res;
6960 int old_exc_flags = get_float_exception_flags(status);
6962 v = float32_to_int32(a STATUS_VAR);
6963 if (v < -0x8000) {
6964 res = -0x8000;
6965 } else if (v > 0x7fff) {
6966 res = 0x7fff;
6967 } else {
6968 return v;
6971 set_float_exception_flags(old_exc_flags, status);
6972 float_raise(float_flag_invalid STATUS_VAR);
6973 return res;
6976 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6978 int32_t v;
6979 uint_fast16_t res;
6980 int old_exc_flags = get_float_exception_flags(status);
6982 v = float32_to_int32(a STATUS_VAR);
6983 if (v < 0) {
6984 res = 0;
6985 } else if (v > 0xffff) {
6986 res = 0xffff;
6987 } else {
6988 return v;
6991 set_float_exception_flags(old_exc_flags, status);
6992 float_raise(float_flag_invalid STATUS_VAR);
6993 return res;
6996 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6998 int64_t v;
6999 uint_fast16_t res;
7000 int old_exc_flags = get_float_exception_flags(status);
7002 v = float32_to_int64_round_to_zero(a STATUS_VAR);
7003 if (v < 0) {
7004 res = 0;
7005 } else if (v > 0xffff) {
7006 res = 0xffff;
7007 } else {
7008 return v;
7010 set_float_exception_flags(old_exc_flags, status);
7011 float_raise(float_flag_invalid STATUS_VAR);
7012 return res;
7015 uint32 float64_to_uint32( float64 a STATUS_PARAM )
7017 uint64_t v;
7018 uint32 res;
7019 int old_exc_flags = get_float_exception_flags(status);
7021 v = float64_to_uint64(a STATUS_VAR);
7022 if (v > 0xffffffff) {
7023 res = 0xffffffff;
7024 } else {
7025 return v;
7027 set_float_exception_flags(old_exc_flags, status);
7028 float_raise(float_flag_invalid STATUS_VAR);
7029 return res;
7032 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
7034 uint64_t v;
7035 uint32 res;
7036 int old_exc_flags = get_float_exception_flags(status);
7038 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
7039 if (v > 0xffffffff) {
7040 res = 0xffffffff;
7041 } else {
7042 return v;
7044 set_float_exception_flags(old_exc_flags, status);
7045 float_raise(float_flag_invalid STATUS_VAR);
7046 return res;
7049 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
7051 int64_t v;
7052 int_fast16_t res;
7053 int old_exc_flags = get_float_exception_flags(status);
7055 v = float64_to_int32(a STATUS_VAR);
7056 if (v < -0x8000) {
7057 res = -0x8000;
7058 } else if (v > 0x7fff) {
7059 res = 0x7fff;
7060 } else {
7061 return v;
7064 set_float_exception_flags(old_exc_flags, status);
7065 float_raise(float_flag_invalid STATUS_VAR);
7066 return res;
7069 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
7071 int64_t v;
7072 uint_fast16_t res;
7073 int old_exc_flags = get_float_exception_flags(status);
7075 v = float64_to_int32(a STATUS_VAR);
7076 if (v < 0) {
7077 res = 0;
7078 } else if (v > 0xffff) {
7079 res = 0xffff;
7080 } else {
7081 return v;
7084 set_float_exception_flags(old_exc_flags, status);
7085 float_raise(float_flag_invalid STATUS_VAR);
7086 return res;
7089 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
7091 int64_t v;
7092 uint_fast16_t res;
7093 int old_exc_flags = get_float_exception_flags(status);
7095 v = float64_to_int64_round_to_zero(a STATUS_VAR);
7096 if (v < 0) {
7097 res = 0;
7098 } else if (v > 0xffff) {
7099 res = 0xffff;
7100 } else {
7101 return v;
7103 set_float_exception_flags(old_exc_flags, status);
7104 float_raise(float_flag_invalid STATUS_VAR);
7105 return res;
7108 /*----------------------------------------------------------------------------
7109 | Returns the result of converting the double-precision floating-point value
7110 | `a' to the 64-bit unsigned integer format. The conversion is
7111 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7112 | Arithmetic---which means in particular that the conversion is rounded
7113 | according to the current rounding mode. If `a' is a NaN, the largest
7114 | positive integer is returned. If the conversion overflows, the
7115 | largest unsigned integer is returned. If 'a' is negative, the value is
7116 | rounded and zero is returned; negative values that do not round to zero
7117 | will raise the inexact exception.
7118 *----------------------------------------------------------------------------*/
7120 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7122 flag aSign;
7123 int_fast16_t aExp, shiftCount;
7124 uint64_t aSig, aSigExtra;
7125 a = float64_squash_input_denormal(a STATUS_VAR);
7127 aSig = extractFloat64Frac(a);
7128 aExp = extractFloat64Exp(a);
7129 aSign = extractFloat64Sign(a);
7130 if (aSign && (aExp > 1022)) {
7131 float_raise(float_flag_invalid STATUS_VAR);
7132 if (float64_is_any_nan(a)) {
7133 return LIT64(0xFFFFFFFFFFFFFFFF);
7134 } else {
7135 return 0;
7138 if (aExp) {
7139 aSig |= LIT64(0x0010000000000000);
7141 shiftCount = 0x433 - aExp;
7142 if (shiftCount <= 0) {
7143 if (0x43E < aExp) {
7144 float_raise(float_flag_invalid STATUS_VAR);
7145 return LIT64(0xFFFFFFFFFFFFFFFF);
7147 aSigExtra = 0;
7148 aSig <<= -shiftCount;
7149 } else {
7150 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7152 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7155 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7157 signed char current_rounding_mode = STATUS(float_rounding_mode);
7158 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7159 int64_t v = float64_to_uint64(a STATUS_VAR);
7160 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7161 return v;
7164 #define COMPARE(s, nan_exp) \
7165 static inline int float ## s ## _compare_internal( float ## s a, float ## s b, \
7166 int is_quiet STATUS_PARAM ) \
7168 flag aSign, bSign; \
7169 uint ## s ## _t av, bv; \
7170 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7171 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7173 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7174 extractFloat ## s ## Frac( a ) ) || \
7175 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7176 extractFloat ## s ## Frac( b ) )) { \
7177 if (!is_quiet || \
7178 float ## s ## _is_signaling_nan( a ) || \
7179 float ## s ## _is_signaling_nan( b ) ) { \
7180 float_raise( float_flag_invalid STATUS_VAR); \
7182 return float_relation_unordered; \
7184 aSign = extractFloat ## s ## Sign( a ); \
7185 bSign = extractFloat ## s ## Sign( b ); \
7186 av = float ## s ## _val(a); \
7187 bv = float ## s ## _val(b); \
7188 if ( aSign != bSign ) { \
7189 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7190 /* zero case */ \
7191 return float_relation_equal; \
7192 } else { \
7193 return 1 - (2 * aSign); \
7195 } else { \
7196 if (av == bv) { \
7197 return float_relation_equal; \
7198 } else { \
7199 return 1 - 2 * (aSign ^ ( av < bv )); \
7204 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
7206 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7209 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
7211 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7214 COMPARE(32, 0xff)
7215 COMPARE(64, 0x7ff)
7217 static inline int floatx80_compare_internal( floatx80 a, floatx80 b,
7218 int is_quiet STATUS_PARAM )
7220 flag aSign, bSign;
7222 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7223 ( extractFloatx80Frac( a )<<1 ) ) ||
7224 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7225 ( extractFloatx80Frac( b )<<1 ) )) {
7226 if (!is_quiet ||
7227 floatx80_is_signaling_nan( a ) ||
7228 floatx80_is_signaling_nan( b ) ) {
7229 float_raise( float_flag_invalid STATUS_VAR);
7231 return float_relation_unordered;
7233 aSign = extractFloatx80Sign( a );
7234 bSign = extractFloatx80Sign( b );
7235 if ( aSign != bSign ) {
7237 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7238 ( ( a.low | b.low ) == 0 ) ) {
7239 /* zero case */
7240 return float_relation_equal;
7241 } else {
7242 return 1 - (2 * aSign);
7244 } else {
7245 if (a.low == b.low && a.high == b.high) {
7246 return float_relation_equal;
7247 } else {
7248 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7253 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7255 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7258 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7260 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7263 static inline int float128_compare_internal( float128 a, float128 b,
7264 int is_quiet STATUS_PARAM )
7266 flag aSign, bSign;
7268 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7269 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7270 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7271 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7272 if (!is_quiet ||
7273 float128_is_signaling_nan( a ) ||
7274 float128_is_signaling_nan( b ) ) {
7275 float_raise( float_flag_invalid STATUS_VAR);
7277 return float_relation_unordered;
7279 aSign = extractFloat128Sign( a );
7280 bSign = extractFloat128Sign( b );
7281 if ( aSign != bSign ) {
7282 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7283 /* zero case */
7284 return float_relation_equal;
7285 } else {
7286 return 1 - (2 * aSign);
7288 } else {
7289 if (a.low == b.low && a.high == b.high) {
7290 return float_relation_equal;
7291 } else {
7292 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7297 int float128_compare( float128 a, float128 b STATUS_PARAM )
7299 return float128_compare_internal(a, b, 0 STATUS_VAR);
7302 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7304 return float128_compare_internal(a, b, 1 STATUS_VAR);
7307 /* min() and max() functions. These can't be implemented as
7308 * 'compare and pick one input' because that would mishandle
7309 * NaNs and +0 vs -0.
7311 * minnum() and maxnum() functions. These are similar to the min()
7312 * and max() functions but if one of the arguments is a QNaN and
7313 * the other is numerical then the numerical argument is returned.
7314 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7315 * and maxNum() operations. min() and max() are the typical min/max
7316 * semantics provided by many CPUs which predate that specification.
7318 * minnummag() and maxnummag() functions correspond to minNumMag()
7319 * and minNumMag() from the IEEE-754 2008.
7321 #define MINMAX(s) \
7322 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7323 int ismin, int isieee, \
7324 int ismag STATUS_PARAM) \
7326 flag aSign, bSign; \
7327 uint ## s ## _t av, bv, aav, abv; \
7328 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7329 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7330 if (float ## s ## _is_any_nan(a) || \
7331 float ## s ## _is_any_nan(b)) { \
7332 if (isieee) { \
7333 if (float ## s ## _is_quiet_nan(a) && \
7334 !float ## s ##_is_any_nan(b)) { \
7335 return b; \
7336 } else if (float ## s ## _is_quiet_nan(b) && \
7337 !float ## s ## _is_any_nan(a)) { \
7338 return a; \
7341 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7343 aSign = extractFloat ## s ## Sign(a); \
7344 bSign = extractFloat ## s ## Sign(b); \
7345 av = float ## s ## _val(a); \
7346 bv = float ## s ## _val(b); \
7347 if (ismag) { \
7348 aav = float ## s ## _abs(av); \
7349 abv = float ## s ## _abs(bv); \
7350 if (aav != abv) { \
7351 if (ismin) { \
7352 return (aav < abv) ? a : b; \
7353 } else { \
7354 return (aav < abv) ? b : a; \
7358 if (aSign != bSign) { \
7359 if (ismin) { \
7360 return aSign ? a : b; \
7361 } else { \
7362 return aSign ? b : a; \
7364 } else { \
7365 if (ismin) { \
7366 return (aSign ^ (av < bv)) ? a : b; \
7367 } else { \
7368 return (aSign ^ (av < bv)) ? b : a; \
7373 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7375 return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR); \
7378 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7380 return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR); \
7383 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7385 return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR); \
7388 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7390 return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR); \
7393 float ## s float ## s ## _minnummag(float ## s a, float ## s b STATUS_PARAM) \
7395 return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR); \
7398 float ## s float ## s ## _maxnummag(float ## s a, float ## s b STATUS_PARAM) \
7400 return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR); \
7403 MINMAX(32)
7404 MINMAX(64)
7407 /* Multiply A by 2 raised to the power N. */
7408 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7410 flag aSign;
7411 int16_t aExp;
7412 uint32_t aSig;
7414 a = float32_squash_input_denormal(a STATUS_VAR);
7415 aSig = extractFloat32Frac( a );
7416 aExp = extractFloat32Exp( a );
7417 aSign = extractFloat32Sign( a );
7419 if ( aExp == 0xFF ) {
7420 if ( aSig ) {
7421 return propagateFloat32NaN( a, a STATUS_VAR );
7423 return a;
7425 if (aExp != 0) {
7426 aSig |= 0x00800000;
7427 } else if (aSig == 0) {
7428 return a;
7429 } else {
7430 aExp++;
7433 if (n > 0x200) {
7434 n = 0x200;
7435 } else if (n < -0x200) {
7436 n = -0x200;
7439 aExp += n - 1;
7440 aSig <<= 7;
7441 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7444 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7446 flag aSign;
7447 int16_t aExp;
7448 uint64_t aSig;
7450 a = float64_squash_input_denormal(a STATUS_VAR);
7451 aSig = extractFloat64Frac( a );
7452 aExp = extractFloat64Exp( a );
7453 aSign = extractFloat64Sign( a );
7455 if ( aExp == 0x7FF ) {
7456 if ( aSig ) {
7457 return propagateFloat64NaN( a, a STATUS_VAR );
7459 return a;
7461 if (aExp != 0) {
7462 aSig |= LIT64( 0x0010000000000000 );
7463 } else if (aSig == 0) {
7464 return a;
7465 } else {
7466 aExp++;
7469 if (n > 0x1000) {
7470 n = 0x1000;
7471 } else if (n < -0x1000) {
7472 n = -0x1000;
7475 aExp += n - 1;
7476 aSig <<= 10;
7477 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7480 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7482 flag aSign;
7483 int32_t aExp;
7484 uint64_t aSig;
7486 aSig = extractFloatx80Frac( a );
7487 aExp = extractFloatx80Exp( a );
7488 aSign = extractFloatx80Sign( a );
7490 if ( aExp == 0x7FFF ) {
7491 if ( aSig<<1 ) {
7492 return propagateFloatx80NaN( a, a STATUS_VAR );
7494 return a;
7497 if (aExp == 0) {
7498 if (aSig == 0) {
7499 return a;
7501 aExp++;
7504 if (n > 0x10000) {
7505 n = 0x10000;
7506 } else if (n < -0x10000) {
7507 n = -0x10000;
7510 aExp += n;
7511 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7512 aSign, aExp, aSig, 0 STATUS_VAR );
7515 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7517 flag aSign;
7518 int32_t aExp;
7519 uint64_t aSig0, aSig1;
7521 aSig1 = extractFloat128Frac1( a );
7522 aSig0 = extractFloat128Frac0( a );
7523 aExp = extractFloat128Exp( a );
7524 aSign = extractFloat128Sign( a );
7525 if ( aExp == 0x7FFF ) {
7526 if ( aSig0 | aSig1 ) {
7527 return propagateFloat128NaN( a, a STATUS_VAR );
7529 return a;
7531 if (aExp != 0) {
7532 aSig0 |= LIT64( 0x0001000000000000 );
7533 } else if (aSig0 == 0 && aSig1 == 0) {
7534 return a;
7535 } else {
7536 aExp++;
7539 if (n > 0x10000) {
7540 n = 0x10000;
7541 } else if (n < -0x10000) {
7542 n = -0x10000;
7545 aExp += n - 1;
7546 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7547 STATUS_VAR );