1 /* Function acosf vectorized with AVX2.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
25 * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
/* Byte offsets of named rows inside __svml_sacos_data_internal.  Every row
   of that table is 8 x 32-bit = 32 bytes, so 160 is the sixth row and 224
   is the eighth row, matching where sqrt_coeff and poly_coeff sit in the
   table's row order.
   NOTE(review): the kernel also uses OneHalf, SmallNorm, MOne, Two, PiH and
   Pi2H as offsets; their definitions are not visible in this excerpt.  */
30 /* Offsets for data table __svml_sacos_data_internal
37 #define sqrt_coeff 160
38 #define poly_coeff 224
/* Place the kernel in the AVX2-specific text section.  */
44 .section .text.avx2, "ax", @progbits
/* _ZGVdN8v_acosf_avx2: acosf over eight packed single-precision lanes.

   The result vector is produced in %ymm0.  The code below treats %ymm5 as
   the argument vector; the instruction that initialises %ymm5 is not part
   of this excerpt (NOTE(review): presumably a copy of the incoming %ymm0 --
   confirm against the full file).

   Fast path, per lane (constants come from __svml_sacos_data_internal):
     x   = -|arg|
     Y   = 0.5 + 0.5*x
     R2  = min(x*x, Y)
     sel = !(R2 < Y)            -- lanes where the sqrt form is used
     SQ  ~ 2*sqrt(Y)            -- rsqrt estimate plus a polynomial correction
     res = const + t*(1 + R2*Poly(R2)),  t = (sel ? SQ : x) ^ sign(arg)
   Lanes flagged by the range mask are handled in L(SPECIAL_VALUES_BRANCH).  */
45 ENTRY(_ZGVdN8v_acosf_avx2)
/* Unwind info: the CFA is 16 bytes above the CFA register from here on.
   NOTE(review): the stack-adjusting instruction this annotates is not
   visible in this excerpt.  */
47 cfi_def_cfa_offset(16)
55 * 2*sqrt(X) ~ Sh - Sl (to 24+ bits)
/* ymm6 = SgnBit (offset 0, first row of the table), ymm7 = OneHalf.  */
58 vmovups __svml_sacos_data_internal(%rip), %ymm6
59 vmovups OneHalf+__svml_sacos_data_internal(%rip), %ymm7
/* ymm4 = ymm5 | SgnBit: forces the sign bit on, i.e. x = -|arg|.  */
63 vorps %ymm5, %ymm6, %ymm4
65 /* Y = 0.5 + 0.5*(-x) */
/* ymm7 = ymm7*ymm4 + ymm7 (ymm7 held OneHalf on entry to this instruction).  */
66 vfmadd231ps %ymm4, %ymm7, %ymm7
/* ymm8 = x*x.  */
69 vmulps %ymm4, %ymm4, %ymm8
/* ymm0 = first sqrt_coeff row (seed of the sqrt correction polynomial).  */
72 vmovups sqrt_coeff+__svml_sacos_data_internal(%rip), %ymm0
/* ymm9 = lanes where !(x >= MOne), unordered compares true: with
   x = -|arg| this flags |arg| > 1 and NaN.
   NOTE(review): ymm9 is overwritten further down before any visible use;
   the instruction that consumes this mask is not part of this excerpt.  */
73 vcmpnge_uqps MOne+__svml_sacos_data_internal(%rip), %ymm4, %ymm9
/* ymm10 = lanes where Y < SmallNorm (ordered compare).  */
74 vcmplt_oqps SmallNorm+__svml_sacos_data_internal(%rip), %ymm7, %ymm10
/* ymm2 = R2 = min(x*x, Y).  */
75 vminps %ymm7, %ymm8, %ymm2
/* ymm14 = 2*Y.  */
76 vaddps %ymm7, %ymm7, %ymm14
/* ymm11 = approximate 1/sqrt(Y).  */
77 vrsqrtps %ymm7, %ymm11
/* ymm8 = third poly_coeff row.  */
78 vmovups poly_coeff+64+__svml_sacos_data_internal(%rip), %ymm8
/* ymm1 = sel = !(R2 < Y): set in the lanes where min() picked Y.  */
79 vcmpnlt_uqps %ymm7, %ymm2, %ymm1
/* ymm7 = R2*R2 (Y is no longer needed in ymm7).  */
80 vmulps %ymm2, %ymm2, %ymm7
/* ymm8 = R2*ymm8 + fourth poly_coeff row.  */
81 vfmadd213ps poly_coeff+96+__svml_sacos_data_internal(%rip), %ymm2, %ymm8
/* ymm9 = first poly_coeff row (this replaces the range mask held in ymm9).  */
85 vmovups poly_coeff+__svml_sacos_data_internal(%rip), %ymm9
/* ymm12 = ~ymm10 & ymm11: rsqrt estimate, zeroed where Y < SmallNorm.  */
86 vandnps %ymm11, %ymm10, %ymm12
/* ymm13 = ymm12*ymm12.  */
87 vmulps %ymm12, %ymm12, %ymm13
/* ymm9 = R2*ymm9 + second poly_coeff row.  */
88 vfmadd213ps poly_coeff+32+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
/* ymm10 = lanes where arg (ymm5) < R2.  */
91 vcmplt_oqps %ymm2, %ymm5, %ymm10
/* ymm9 = (R2*R2)*ymm9 + ymm8: merge the two partial polynomials.  */
92 vfmadd213ps %ymm8, %ymm7, %ymm9
/* ymm3 = ymm5 & SgnBit: sign bit of the argument.  */
93 vandps %ymm5, %ymm6, %ymm3
/* ymm6 = ymm12*(2*Y): first approximation of 2*sqrt(Y).  */
94 vmulps %ymm14, %ymm12, %ymm6
/* ymm14 = ymm13*(2*Y) - Two: error term of the rsqrt estimate.  */
95 vfmsub213ps Two+__svml_sacos_data_internal(%rip), %ymm13, %ymm14
/* ymm9 = R2*ymm9 + fifth poly_coeff row: Poly(R2) complete.  */
96 vfmadd213ps poly_coeff+128+__svml_sacos_data_internal(%rip), %ymm2, %ymm9
/* ymm0 = ymm14*ymm0 + second sqrt_coeff row.  */
97 vfmadd213ps sqrt_coeff+32+__svml_sacos_data_internal(%rip), %ymm14, %ymm0
/* ymm15 = ymm6*ymm14.  */
98 vmulps %ymm14, %ymm6, %ymm15
/* ymm14 = R2*Poly(R2).  */
99 vmulps %ymm9, %ymm2, %ymm14
/* ymm0 = ymm6 - ymm15*ymm0: corrected SQ ~ 2*sqrt(Y).  */
100 vfnmadd213ps %ymm6, %ymm15, %ymm0
/* ymm0 = sel ? SQ : x.  */
101 vblendvps %ymm1, %ymm0, %ymm4, %ymm0
/* ymm2 = sel & PiH; ymm12 = ~sel & Pi2H.  */
102 vandps PiH+__svml_sacos_data_internal(%rip), %ymm1, %ymm2
103 vandnps Pi2H+__svml_sacos_data_internal(%rip), %ymm1, %ymm12
/* ymm1 = t = ymm0 ^ sign(arg).  */
104 vxorps %ymm3, %ymm0, %ymm1
/* ymm14 = t*ymm14 + t = t*(1 + R2*Poly(R2)).  */
105 vfmadd213ps %ymm1, %ymm1, %ymm14
/* ymm11 = PiH only in sel lanes where arg < R2; ymm13 = additive constant.  */
106 vandps %ymm10, %ymm2, %ymm11
107 vaddps %ymm12, %ymm11, %ymm13
/* ymm0 = constant + t*(1 + R2*Poly(R2)): fast-path result.  */
108 vaddps %ymm14, %ymm13, %ymm0
/* NOTE(review): the instruction that sets the flags tested by the jne below
   is not visible in this excerpt; the LOE note lists %edx as live here.  */
111 /* Go to special inputs processing branch */
112 jne L(SPECIAL_VALUES_BRANCH)
113 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm5
116 * and exit the function
/* Special path: spill the argument vector to 32(%rsp) and the fast-path
   result to 64(%rsp) so that single elements can be addressed by index.  */
132 L(SPECIAL_VALUES_BRANCH):
133 vmovups %ymm5, 32(%rsp)
134 vmovups %ymm0, 64(%rsp)
135 # LOE rbx r12 r13 r14 r15 edx ymm0
138 # LOE rbx r12 r13 r14 r15 eax edx
/* The .cfi_escape records below give the unwinder the locations of the saved
   r12, r13 and r14, as spelled out in the DW_CFA_expression comment above
   each one.  NOTE(review): the stores that save those registers are not
   visible in this excerpt.  */
142 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
143 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
146 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
147 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
150 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
151 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
152 # LOE rbx r15 r12d r13d
/* NOTE(review): the instruction that sets CF for the jc below is not
   visible in this excerpt.  */
161 /* Call scalar math function */
162 jc L(SCALAR_MATH_CALL)
163 # LOE rbx r15 r12d r13d
/* Loop head; the scalar path jumps back here after patching one element.
   NOTE(review): the counter update and compare feeding the jl below are
   not visible in this excerpt.  */
169 L(SPECIAL_VALUES_LOOP):
173 /* Check bits in range mask */
174 jl L(RANGEMASK_CHECK)
175 # LOE rbx r15 r12d r13d
/* Loop finished: reload the result vector, which now includes any elements
   rewritten by the scalar path.  */
183 vmovups 64(%rsp), %ymm0
/* Same three register-location records as in the special branch above.  */
187 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
188 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
189 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
190 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
191 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
192 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
193 # LOE rbx r12 r13 r14 r15 ymm0
/* Scalar path: element %r14 of the spilled argument vector (32(%rsp)) is
   loaded into %xmm0, and afterwards %xmm0 is stored into element %r14 of the
   spilled result vector (64(%rsp)).
   NOTE(review): the call between the load and the store, and the
   instruction that sets %r14, are not visible in this excerpt.  */
195 /* Scalar math function call
196 * to process special input
201 vmovss 32(%rsp, %r14, 4), %xmm0
203 # LOE rbx r14 r15 r12d r13d xmm0
205 vmovss %xmm0, 64(%rsp, %r14, 4)
207 /* Process special inputs in loop */
208 jmp L(SPECIAL_VALUES_LOOP)
209 # LOE rbx r15 r12d r13d
210 END(_ZGVdN8v_acosf_avx2)
/* Read-only constant table used by _ZGVdN8v_acosf_avx2.  */
212 .section .rodata, "a"
/* C-style description of the table layout, compiled only when
   __svml_sacos_data_internal_typedef is defined.  Each member is 8 x 32-bit,
   i.e. one 32-byte row (sqrt_coeff is 2 rows, poly_coeff is 5 rows), and the
   members are listed in the same order as the .long rows of the table.
   NOTE(review): the opening "typedef struct {" line and the closing #endif
   are not visible in this excerpt.  */
215 #ifdef __svml_sacos_data_internal_typedef
216 typedef unsigned int VUINT32;
218 __declspec(align(32)) VUINT32 SgnBit[8][1];
219 __declspec(align(32)) VUINT32 OneHalf[8][1];
220 __declspec(align(32)) VUINT32 SmallNorm[8][1];
221 __declspec(align(32)) VUINT32 MOne[8][1];
222 __declspec(align(32)) VUINT32 Two[8][1];
223 __declspec(align(32)) VUINT32 sqrt_coeff[2][8][1];
224 __declspec(align(32)) VUINT32 poly_coeff[5][8][1];
225 __declspec(align(32)) VUINT32 Pi2H[8][1];
226 __declspec(align(32)) VUINT32 PiH[8][1];
227 } __svml_sacos_data_internal;
/* The table itself: every row is one 32-bit pattern repeated for all 8 lanes.  */
229 __svml_sacos_data_internal:
/* SgnBit: IEEE-754 single-precision sign-bit mask.  */
231 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
/* OneHalf: 0.5f.  */
234 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
/* SmallNorm: 2^-32.  */
237 .long 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000, 0x2f800000
/* MOne: -1.0f.  */
240 .long 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000, 0xbf800000
/* Two: 2.0f.  */
243 .long 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000, 0x40000000
/* sqrt_coeff[2]: starts at byte offset 160 (five 32-byte rows precede it).  */
246 .long 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004, 0xbdC00004 /* sqrt_coeff2 */
247 .long 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001, 0x3e800001 /* sqrt_coeff1 */
/* poly_coeff[5]: starts at byte offset 224; stored highest-numbered
   coefficient first, as the per-row comments show.  */
250 .long 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07, 0x3d2EDC07 /* poly_coeff5 */
251 .long 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B, 0x3CC32A6B /* poly_coeff4 */
252 .long 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4, 0x3d3A9AB4 /* poly_coeff3 */
253 .long 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12, 0x3d997C12 /* poly_coeff2 */
254 .long 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF, 0x3e2AAAFF /* poly_coeff1 */
/* Pi2H: pi/2 rounded to single precision.  */
257 .long 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB, 0x3fc90FDB
/* PiH: pi rounded to single precision.  */
260 .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
/* Mark the symbol as a data object and record its size for the ELF symbol table.  */
262 .type __svml_sacos_data_internal, @object
263 .size __svml_sacos_data_internal, .-__svml_sacos_data_internal