4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
30 * __vcosf: single precision vector cos
34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 * z*C2))), where z = x*x, all evaluated in double precision.
40 * The largest error is less than 0.6 ulps.
43 #include <sys/isa_defs.h>
46 #define HI(x) *(1+(int *)&x)
47 #define LO(x) *(unsigned *)&x
49 #define HI(x) *(int *)&x
50 #define LO(x) *(1+(unsigned *)&x)
54 #define restrict _Restrict
59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
61 static const double C
[] = {
62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */
63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */
64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */
67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */
68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */
69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */
70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */
71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */
72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */
73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */
89 #define PREPROCESS(N, index, label) \
91 ix = hx & 0x7fffffff; \
94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \
101 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \
105 if (ix >= 0x7f800000) { /* inf or nan */ \
109 z##N = y##N = (double)t; \
111 n##N = ((hx >> 20) & 0x7ff) - 1046; \
112 HI(z##N) = (hx & 0xfffff) | 0x41600000; \
113 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1; \
114 z##N = y##N * y##N; \
115 if (n##N & 1) { /* compute cos y */ \
116 f##N = (float)(one + z##N * (mhalf + z##N * \
117 (C0 + z##N * (C1 + z##N * C2)))); \
118 } else { /* compute sin y */ \
119 f##N = (float)(y##N + y##N * z##N * (S0 + \
120 z##N * (S1 + z##N * S2))); \
122 y[index] = (n##N & 2)? -f##N : f##N; \
128 z##N = y##N * invpio2 + c3two51; \
129 n##N = LO(z##N) + 1; \
131 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \
133 z##N = y##N * y##N; \
134 if (n##N & 1) { /* compute cos y */ \
135 f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \
136 z##N * (C1 + z##N * C2)))); \
137 } else { /* compute sin y */ \
138 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \
141 *y = (n##N & 2)? -f##N : f##N; \
145 __vcosf(int n
, float *restrict x
, int stridex
, float *restrict y
,
148 double y0
, y1
, y2
, y3
;
149 double z0
, z1
, z2
, z3
;
150 float f0
, f1
, f2
, f3
, t
;
151 int n0
= 0, n1
= 0, n2
= 0, n3
, hx
, ix
, medium
;
163 PREPROCESS(0, 0, begin
);
168 PREPROCESS(1, stridey
, process1
);
173 PREPROCESS(2, (stridey
<< 1), process2
);
178 PREPROCESS(3, (stridey
<< 1) + stridey
, process3
);
181 z0
= y0
* invpio2
+ c3two51
;
182 z1
= y1
* invpio2
+ c3two51
;
183 z2
= y2
* invpio2
+ c3two51
;
184 z3
= y3
* invpio2
+ c3two51
;
196 y0
= (y0
- z0
* pio2_1
) - z0
* pio2_t
;
197 y1
= (y1
- z1
* pio2_1
) - z1
* pio2_t
;
198 y2
= (y2
- z2
* pio2_1
) - z2
* pio2_t
;
199 y3
= (y3
- z3
* pio2_1
) - z3
* pio2_t
;
207 hx
= (n0
& 1) | ((n1
& 1) << 1) | ((n2
& 1) << 2) |
211 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
212 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
213 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
214 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
218 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
219 z0
* (C1
+ z0
* C2
))));
220 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
221 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
222 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
226 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
227 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
228 z1
* (C1
+ z1
* C2
))));
229 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
230 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
234 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
235 z0
* (C1
+ z0
* C2
))));
236 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
237 z1
* (C1
+ z1
* C2
))));
238 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
239 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
243 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
244 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
245 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
246 z2
* (C1
+ z2
* C2
))));
247 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
251 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
252 z0
* (C1
+ z0
* C2
))));
253 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
254 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
255 z2
* (C1
+ z2
* C2
))));
256 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
260 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
261 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
262 z1
* (C1
+ z1
* C2
))));
263 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
264 z2
* (C1
+ z2
* C2
))));
265 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
269 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
270 z0
* (C1
+ z0
* C2
))));
271 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
272 z1
* (C1
+ z1
* C2
))));
273 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
274 z2
* (C1
+ z2
* C2
))));
275 f3
= (float)(y3
+ y3
* z3
* (S0
+ z3
* (S1
+ z3
* S2
)));
279 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
280 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
281 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
282 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
283 z3
* (C1
+ z3
* C2
))));
287 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
288 z0
* (C1
+ z0
* C2
))));
289 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
290 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
291 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
292 z3
* (C1
+ z3
* C2
))));
296 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
297 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
298 z1
* (C1
+ z1
* C2
))));
299 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
300 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
301 z3
* (C1
+ z3
* C2
))));
305 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
306 z0
* (C1
+ z0
* C2
))));
307 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
308 z1
* (C1
+ z1
* C2
))));
309 f2
= (float)(y2
+ y2
* z2
* (S0
+ z2
* (S1
+ z2
* S2
)));
310 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
311 z3
* (C1
+ z3
* C2
))));
315 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
316 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
317 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
318 z2
* (C1
+ z2
* C2
))));
319 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
320 z3
* (C1
+ z3
* C2
))));
324 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
325 z0
* (C1
+ z0
* C2
))));
326 f1
= (float)(y1
+ y1
* z1
* (S0
+ z1
* (S1
+ z1
* S2
)));
327 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
328 z2
* (C1
+ z2
* C2
))));
329 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
330 z3
* (C1
+ z3
* C2
))));
334 f0
= (float)(y0
+ y0
* z0
* (S0
+ z0
* (S1
+ z0
* S2
)));
335 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
336 z1
* (C1
+ z1
* C2
))));
337 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
338 z2
* (C1
+ z2
* C2
))));
339 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
340 z3
* (C1
+ z3
* C2
))));
344 f0
= (float)(one
+ z0
* (mhalf
+ z0
* (C0
+
345 z0
* (C1
+ z0
* C2
))));
346 f1
= (float)(one
+ z1
* (mhalf
+ z1
* (C0
+
347 z1
* (C1
+ z1
* C2
))));
348 f2
= (float)(one
+ z2
* (mhalf
+ z2
* (C0
+
349 z2
* (C1
+ z2
* C2
))));
350 f3
= (float)(one
+ z3
* (mhalf
+ z3
* (C0
+
351 z3
* (C1
+ z3
* C2
))));
354 *y
= (n0
& 2)? -f0
: f0
;
356 *y
= (n1
& 2)? -f1
: f1
;
358 *y
= (n2
& 2)? -f2
: f2
;
360 *y
= (n3
& 2)? -f3
: f3
;