1 /* Single-precision vector (Advanced SIMD) cos function.
3 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 static const struct data
25 float32x4_t range_val
, inv_pi
, half_pi
, shift
, pi_1
, pi_2
, pi_3
;
27 /* 1.886 ulp error. */
28 .poly
= { V4 (-0x1.555548p
-3f
), V4 (0x1.110df4p
-7f
), V4 (-0x1.9f42eap
-13f
),
29 V4 (0x1.5b2e76p
-19f
) },
31 .pi_1
= V4 (0x1.921fb6p
+1f
),
32 .pi_2
= V4 (-0x1.777a5cp
-24f
),
33 .pi_3
= V4 (-0x1.ee59dap
-49f
),
35 .inv_pi
= V4 (0x1.45f306p
-2f
),
36 .shift
= V4 (0x1.8p
+23f
),
37 .half_pi
= V4 (0x1.921fb6p0f
),
38 .range_val
= V4 (0x1p
20f
)
41 #define C(i) d->poly[i]
43 static float32x4_t VPCS_ATTR NOINLINE
44 special_case (float32x4_t x
, float32x4_t y
, uint32x4_t odd
, uint32x4_t cmp
)
46 /* Fall back to scalar code. */
47 y
= vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y
), odd
));
48 return v_call_f32 (cosf
, x
, y
, cmp
);
51 float32x4_t VPCS_ATTR NOINLINE
V_NAME_F1 (cos
) (float32x4_t x
)
53 const struct data
*d
= ptr_barrier (&data
);
54 float32x4_t n
, r
, r2
, r3
, y
;
59 cmp
= vcgeq_u32 (vreinterpretq_u32_f32 (r
),
60 vreinterpretq_u32_f32 (d
->range_val
));
61 if (__glibc_unlikely (v_any_u32 (cmp
)))
62 /* If fenv exceptions are to be triggered correctly, set any special lanes
63 to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
64 special-case handler later. */
65 r
= vbslq_f32 (cmp
, v_f32 (1.0f
), r
);
67 cmp
= vcageq_f32 (x
, d
->range_val
);
71 /* n = rint((|x|+pi/2)/pi) - 0.5. */
72 n
= vfmaq_f32 (d
->shift
, d
->inv_pi
, vaddq_f32 (r
, d
->half_pi
));
73 odd
= vshlq_n_u32 (vreinterpretq_u32_f32 (n
), 31);
74 n
= vsubq_f32 (n
, d
->shift
);
75 n
= vsubq_f32 (n
, v_f32 (0.5f
));
77 /* r = |x| - n*pi (range reduction into -pi/2 .. pi/2). */
78 r
= vfmsq_f32 (r
, d
->pi_1
, n
);
79 r
= vfmsq_f32 (r
, d
->pi_2
, n
);
80 r
= vfmsq_f32 (r
, d
->pi_3
, n
);
83 r2
= vmulq_f32 (r
, r
);
84 r3
= vmulq_f32 (r2
, r
);
85 y
= vfmaq_f32 (C (2), C (3), r2
);
86 y
= vfmaq_f32 (C (1), y
, r2
);
87 y
= vfmaq_f32 (C (0), y
, r2
);
88 y
= vfmaq_f32 (r
, y
, r3
);
90 if (__glibc_unlikely (v_any_u32 (cmp
)))
91 return special_case (x
, y
, odd
, cmp
);
92 return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y
), odd
));
94 libmvec_hidden_def (V_NAME_F1 (cos
))
95 HALF_WIDTH_ALIAS_F1 (cos
)