/* Copyright (C) 2009-2016 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
22 #include <spu_intrinsics.h>
24 vector
double __divv2df3 (vector
double a_in
, vector
double b_in
);
26 /* __divv2df3 divides the vector dividend a by the vector divisor b and
27 returns the resulting vector quotient. Maximum error about 0.5 ulp
28 over entire double range including denorms, compared to true result
29 in round-to-nearest rounding mode. Handles Inf or NaN operands and
33 __divv2df3 (vector
double a_in
, vector
double b_in
)
36 vec_int4 exp
, exp_bias
;
37 vec_uint4 no_underflow
, overflow
;
38 vec_float4 mant_bf
, inv_bf
;
39 vec_ullong2 exp_a
, exp_b
;
40 vec_ullong2 a_nan
, a_zero
, a_inf
, a_denorm
, a_denorm0
;
41 vec_ullong2 b_nan
, b_zero
, b_inf
, b_denorm
, b_denorm0
;
43 vec_uint4 a_exp
, b_exp
;
44 vec_ullong2 a_mant_0
, b_mant_0
;
45 vec_ullong2 a_exp_1s
, b_exp_1s
;
46 vec_ullong2 sign_exp_mask
;
49 vec_double2 mant_a
, mant_b
, inv_b
, q0
, q1
, q2
, mult
;
52 vec_uint4 exp_mask_u32
= spu_splats((unsigned int)0x7FF00000);
53 vec_uchar16 splat_hi
= (vec_uchar16
){0,1,2,3, 0,1,2,3, 8, 9,10,11, 8,9,10,11};
54 vec_uchar16 swap_32
= (vec_uchar16
){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
55 vec_ullong2 exp_mask
= spu_splats(0x7FF0000000000000ULL
);
56 vec_ullong2 sign_mask
= spu_splats(0x8000000000000000ULL
);
57 vec_float4 onef
= spu_splats(1.0f
);
58 vec_double2 one
= spu_splats(1.0);
59 vec_double2 exp_53
= (vec_double2
)spu_splats(0x0350000000000000ULL
);
61 sign_exp_mask
= spu_or(sign_mask
, exp_mask
);
63 /* Extract the floating point components from each of the operands including
64 * exponent and mantissa.
66 a_exp
= (vec_uint4
)spu_and((vec_uint4
)a_in
, exp_mask_u32
);
67 a_exp
= spu_shuffle(a_exp
, a_exp
, splat_hi
);
68 b_exp
= (vec_uint4
)spu_and((vec_uint4
)b_in
, exp_mask_u32
);
69 b_exp
= spu_shuffle(b_exp
, b_exp
, splat_hi
);
71 a_mant_0
= (vec_ullong2
)spu_cmpeq((vec_uint4
)spu_andc((vec_ullong2
)a_in
, sign_exp_mask
), 0);
72 a_mant_0
= spu_and(a_mant_0
, spu_shuffle(a_mant_0
, a_mant_0
, swap_32
));
74 b_mant_0
= (vec_ullong2
)spu_cmpeq((vec_uint4
)spu_andc((vec_ullong2
)b_in
, sign_exp_mask
), 0);
75 b_mant_0
= spu_and(b_mant_0
, spu_shuffle(b_mant_0
, b_mant_0
, swap_32
));
77 a_exp_1s
= (vec_ullong2
)spu_cmpeq(a_exp
, exp_mask_u32
);
78 b_exp_1s
= (vec_ullong2
)spu_cmpeq(b_exp
, exp_mask_u32
);
80 /* Identify all possible special values that must be accommodated including:
81 * +-denorm, +-0, +-infinity, and NaNs.
83 a_denorm0
= (vec_ullong2
)spu_cmpeq(a_exp
, 0);
84 a_nan
= spu_andc(a_exp_1s
, a_mant_0
);
85 a_zero
= spu_and (a_denorm0
, a_mant_0
);
86 a_inf
= spu_and (a_exp_1s
, a_mant_0
);
87 a_denorm
= spu_andc(a_denorm0
, a_zero
);
89 b_denorm0
= (vec_ullong2
)spu_cmpeq(b_exp
, 0);
90 b_nan
= spu_andc(b_exp_1s
, b_mant_0
);
91 b_zero
= spu_and (b_denorm0
, b_mant_0
);
92 b_inf
= spu_and (b_exp_1s
, b_mant_0
);
93 b_denorm
= spu_andc(b_denorm0
, b_zero
);
95 /* Scale denorm inputs to into normalized numbers by conditionally scaling the
98 a
= spu_sub(spu_or(a_in
, exp_53
), spu_sel(exp_53
, a_in
, sign_mask
));
99 a
= spu_sel(a_in
, a
, a_denorm
);
101 b
= spu_sub(spu_or(b_in
, exp_53
), spu_sel(exp_53
, b_in
, sign_mask
));
102 b
= spu_sel(b_in
, b
, b_denorm
);
104 /* Extract the divisor and dividend exponent and force parameters into the signed
105 * range [1.0,2.0) or [-1.0,2.0).
107 exp_a
= spu_and((vec_ullong2
)a
, exp_mask
);
108 exp_b
= spu_and((vec_ullong2
)b
, exp_mask
);
110 mant_a
= spu_sel(a
, one
, (vec_ullong2
)exp_mask
);
111 mant_b
= spu_sel(b
, one
, (vec_ullong2
)exp_mask
);
113 /* Approximate the single reciprocal of b by using
114 * the single precision reciprocal estimate followed by one
115 * single precision iteration of Newton-Raphson.
117 mant_bf
= spu_roundtf(mant_b
);
118 inv_bf
= spu_re(mant_bf
);
119 inv_bf
= spu_madd(spu_nmsub(mant_bf
, inv_bf
, onef
), inv_bf
, inv_bf
);
121 /* Perform 2 more Newton-Raphson iterations in double precision. The
122 * result (q1) is in the range (0.5, 2.0).
124 inv_b
= spu_extend(inv_bf
);
125 inv_b
= spu_madd(spu_nmsub(mant_b
, inv_b
, one
), inv_b
, inv_b
);
126 q0
= spu_mul(mant_a
, inv_b
);
127 q1
= spu_madd(spu_nmsub(mant_b
, q0
, mant_a
), inv_b
, q0
);
129 /* Determine the exponent correction factor that must be applied
130 * to q1 by taking into account the exponent of the normalized inputs
131 * and the scale factors that were applied to normalize them.
133 exp
= spu_rlmaska(spu_sub((vec_int4
)exp_a
, (vec_int4
)exp_b
), -20);
134 exp
= spu_add(exp
, (vec_int4
)spu_add(spu_and((vec_int4
)a_denorm
, -0x34), spu_and((vec_int4
)b_denorm
, 0x34)));
136 /* Bias the quotient exponent depending on the sign of the exponent correction
137 * factor so that a single multiplier will ensure the entire double precision
138 * domain (including denorms) can be achieved.
140 * exp bias q1 adjust exp
141 * ===== ======== ==========
145 exp_bias
= spu_xor(spu_rlmaska(exp
, -31), 64);
146 exp
= spu_sub(exp
, exp_bias
);
148 q1
= spu_sel(q1
, (vec_double2
)spu_add((vec_int4
)q1
, spu_sl(exp_bias
, 20)), exp_mask
);
150 /* Compute a multiplier (mult) to applied to the quotient (q1) to produce the
151 * expected result. On overflow, clamp the multiplier to the maximum non-infinite
152 * number in case the rounding mode is not round-to-nearest.
154 exp
= spu_add(exp
, 0x3FF);
155 no_underflow
= spu_cmpgt(exp
, 0);
156 overflow
= spu_cmpgt(exp
, 0x7FE);
157 exp
= spu_and(spu_sl(exp
, 20), (vec_int4
)no_underflow
);
158 exp
= spu_and(exp
, (vec_int4
)exp_mask
);
160 mult
= spu_sel((vec_double2
)exp
, (vec_double2
)(spu_add((vec_uint4
)exp_mask
, -1)), (vec_ullong2
)overflow
);
162 /* Handle special value conditions. These include:
164 * 1) IF either operand is a NaN OR both operands are 0 or INFINITY THEN a NaN
166 * 2) ELSE IF the dividend is an INFINITY OR the divisor is 0 THEN a INFINITY results.
167 * 3) ELSE IF the dividend is 0 OR the divisor is INFINITY THEN a 0 results.
169 mult
= spu_andc(mult
, (vec_double2
)spu_or(a_zero
, b_inf
));
170 mult
= spu_sel(mult
, (vec_double2
)exp_mask
, spu_or(a_inf
, b_zero
));
172 nan
= spu_or(a_nan
, b_nan
);
173 nan
= spu_or(nan
, spu_and(a_zero
, b_zero
));
174 nan
= spu_or(nan
, spu_and(a_inf
, b_inf
));
176 mult
= spu_or(mult
, (vec_double2
)nan
);
178 /* Scale the final quotient */
180 q2
= spu_mul(q1
, mult
);
/* Scalar division shares the implementation of the vector routine, so
   the scalar entry point is declared as an alias of __divv2df3.  */
double __divdf3 (double a, double b)
  __attribute__ ((__alias__ ("__divv2df3")));
/* Compatibility entry point: some toolchain builds referred to this helper
   as __fast_divdf3, so that name is kept as a further alias of
   __divv2df3.  */
double __fast_divdf3 (double a, double b)
  __attribute__ ((__alias__ ("__divv2df3")));