1 //===-- lib/truncdfsf2.c - double -> single conversion ------------*- C -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements a fairly generic conversion from a wider to a narrower
11 // IEEE-754 floating-point type in the default (round to nearest, ties to even)
12 // rounding mode. The constants and types defined following the includes below
13 // parameterize the conversion.
15 // This routine can be trivially adapted to support conversions to
16 // half-precision or from quad-precision. It does not support types that don't
17 // use the usual IEEE-754 interchange formats; specifically, some work would be
18 // needed to adapt it to (for example) the Intel 80-bit format or PowerPC
19 // double-double format.
// Please note, however, that this implementation is only intended to support
// *narrowing* operations; if you need to convert to a *wider* floating-point
// type (e.g. float -> double), then this routine will not do what you want it
// to.
26 // It also requires that integer types at least as large as both formats
27 // are available on the target platform; this may pose a problem when trying
28 // to add support for quad on some 32-bit systems, for example.
30 // Finally, the following assumptions are made:
// 1. floating-point types and integer types have the same endianness on the
//    target platform
35 // 2. quiet NaNs, if supported, are indicated by the leading bit of the
36 // significand field being set
38 //===----------------------------------------------------------------------===//
// Source-format parameters.
//   src_rep_t  - unsigned integer type used to hold the raw bit pattern of a
//                source value (this is what srcToRep returns).
//   SRC_REP_C  - builds integer constants of that type.
//   srcSigBits - number of explicitly stored significand bits in the source
//                format; the exponent width is derived from it later as
//                (total bits - srcSigBits - 1).
// NOTE(review): the floating-point type src_t itself is not defined in the
// lines visible here; it is assumed to be declared alongside these.
typedef uint64_t src_rep_t;
#define SRC_REP_C UINT64_C
static const int srcSigBits = 52;

// Destination-format parameters, with the same meaning as the source-format
// ones above.
// NOTE(review): dst_t is likewise assumed to be declared outside the visible
// lines.
typedef uint32_t dst_rep_t;
#define DST_REP_C UINT32_C
static const int dstSigBits = 23;
52 // End of specialization parameters. Two helper routines for conversion to and
53 // from the representation of floating-point data as integer values follow.
55 static inline src_rep_t
srcToRep(src_t x
) {
56 const union { src_t f
; src_rep_t i
; } rep
= {.f
= x
};
60 static inline dst_t
dstFromRep(dst_rep_t x
) {
61 const union { dst_t f
; dst_rep_t i
; } rep
= {.i
= x
};
65 // End helper routines. Conversion implementation follows.
// Associates the ARM EABI short name "d2f" with truncdfsf2.
// NOTE(review): ARM_EABI_FNALIAS is defined outside this file's visible
// lines; presumably it emits an __aeabi_d2f alias for __truncdfsf2 on ARM
// EABI targets and expands to nothing elsewhere - confirm in the project
// header.
ARM_EABI_FNALIAS(d2f, truncdfsf2);
70 __truncdfsf2(src_t a
) {
72 // Various constants whose values follow from the type parameters.
73 // Any reasonable optimizer will fold and propagate all of these.
74 const int srcBits
= sizeof(src_t
)*CHAR_BIT
;
75 const int srcExpBits
= srcBits
- srcSigBits
- 1;
76 const int srcInfExp
= (1 << srcExpBits
) - 1;
77 const int srcExpBias
= srcInfExp
>> 1;
79 const src_rep_t srcMinNormal
= SRC_REP_C(1) << srcSigBits
;
80 const src_rep_t significandMask
= srcMinNormal
- 1;
81 const src_rep_t srcInfinity
= (src_rep_t
)srcInfExp
<< srcSigBits
;
82 const src_rep_t srcSignMask
= SRC_REP_C(1) << (srcSigBits
+ srcExpBits
);
83 const src_rep_t srcAbsMask
= srcSignMask
- 1;
84 const src_rep_t roundMask
= (SRC_REP_C(1) << (srcSigBits
- dstSigBits
)) - 1;
85 const src_rep_t halfway
= SRC_REP_C(1) << (srcSigBits
- dstSigBits
- 1);
87 const int dstBits
= sizeof(dst_t
)*CHAR_BIT
;
88 const int dstExpBits
= dstBits
- dstSigBits
- 1;
89 const int dstInfExp
= (1 << dstExpBits
) - 1;
90 const int dstExpBias
= dstInfExp
>> 1;
92 const int underflowExponent
= srcExpBias
+ 1 - dstExpBias
;
93 const int overflowExponent
= srcExpBias
+ dstInfExp
- dstExpBias
;
94 const src_rep_t underflow
= (src_rep_t
)underflowExponent
<< srcSigBits
;
95 const src_rep_t overflow
= (src_rep_t
)overflowExponent
<< srcSigBits
;
97 const dst_rep_t dstQNaN
= DST_REP_C(1) << (dstSigBits
- 1);
98 const dst_rep_t dstNaNCode
= dstQNaN
- 1;
100 // Break a into a sign and representation of the absolute value
101 const src_rep_t aRep
= srcToRep(a
);
102 const src_rep_t aAbs
= aRep
& srcAbsMask
;
103 const src_rep_t sign
= aRep
& srcSignMask
;
106 if (aAbs
- underflow
< aAbs
- overflow
) {
107 // The exponent of a is within the range of normal numbers in the
108 // destination format. We can convert by simply right-shifting with
109 // rounding and adjusting the exponent.
110 absResult
= aAbs
>> (srcSigBits
- dstSigBits
);
111 absResult
-= (dst_rep_t
)(srcExpBias
- dstExpBias
) << dstSigBits
;
113 const src_rep_t roundBits
= aAbs
& roundMask
;
116 if (roundBits
> halfway
)
120 else if (roundBits
== halfway
)
121 absResult
+= absResult
& 1;
124 else if (aAbs
> srcInfinity
) {
126 // Conjure the result by beginning with infinity, setting the qNaN
127 // bit and inserting the (truncated) trailing NaN field.
128 absResult
= (dst_rep_t
)dstInfExp
<< dstSigBits
;
129 absResult
|= dstQNaN
;
130 absResult
|= aAbs
& dstNaNCode
;
133 else if (aAbs
> overflow
) {
134 // a overflows to infinity.
135 absResult
= (dst_rep_t
)dstInfExp
<< dstSigBits
;
139 // a underflows on conversion to the destination type or is an exact
140 // zero. The result may be a denormal or zero. Extract the exponent
141 // to get the shift amount for the denormalization.
142 const int aExp
= aAbs
>> srcSigBits
;
143 const int shift
= srcExpBias
- dstExpBias
- aExp
+ 1;
145 const src_rep_t significand
= (aRep
& significandMask
) | srcMinNormal
;
147 // Right shift by the denormalization amount with sticky.
148 if (shift
> srcSigBits
) {
151 const bool sticky
= significand
<< (srcBits
- shift
);
152 src_rep_t denormalizedSignificand
= significand
>> shift
| sticky
;
153 absResult
= denormalizedSignificand
>> (srcSigBits
- dstSigBits
);
154 const src_rep_t roundBits
= denormalizedSignificand
& roundMask
;
156 if (roundBits
> halfway
)
159 else if (roundBits
== halfway
)
160 absResult
+= absResult
& 1;
164 // Apply the signbit to (dst_t)abs(a).
165 const dst_rep_t result
= absResult
| sign
>> (srcBits
- dstBits
);
166 return dstFromRep(result
);