1 /* Function asin vectorized with AVX2.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 *
 *      SelMask = (|x| >= 0.5) ? 1 : 0;
 *      R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|;
 *      asin(x) = (SelMask ? (Pi/2 - 2*Poly(R)) : Poly(R)) * (-1)^sign(x)
 *
 */
/* Offsets for data table __svml_dasin_data_internal.  */
35 #define sqrt_coeff 160
36 #define poly_coeff 288
	.section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_asin_avx2)
	cfi_def_cfa_offset(16)
	/* NOTE(review): the stack-frame setup between ENTRY and the first
	   load is elided in this excerpt; %ymm5 is assumed to already hold
	   the input vector x (it is masked with AbsMask below and spilled
	   to 32(%rsp) on the special-value path) — confirm against the
	   full source.  */
	vmovupd __svml_dasin_data_internal(%rip), %ymm6		/* ymm6 = AbsMask (clears sign bit) */
	vmovupd OneHalf+__svml_dasin_data_internal(%rip), %ymm10	/* ymm10 = 0.5 */
	vmovupd One+__svml_dasin_data_internal(%rip), %ymm8	/* ymm8 = 1.0 */
	/* ymm4 = |x| = x & AbsMask.  */
	vandpd %ymm5, %ymm6, %ymm4
	/* ymm15 = Y = 0.5 - 0.5*|x|, the sqrt-path argument,
	   via fnmadd: ymm15 = 0.5 - |x|*0.5.  */
	vmovapd %ymm10, %ymm15
	vfnmadd231pd %ymm4, %ymm10, %ymm15
	/* ymm7 = x*x, the polynomial argument on the |x| < 0.5 path.  */
	vmulpd %ymm4, %ymm4, %ymm7
	/* ymm9 = mask of special lanes: 1.0 < |x| (NaN lanes stay 0
	   here; the elided mask-extraction below feeds the jne).  */
	vcmplt_oqpd %ymm4, %ymm8, %ymm9
	/* ymm13 = mask: Y < SmallNorm — guards the rsqrt estimate against
	   denormal/zero input (estimate is zeroed by vandnpd below).  */
	vcmplt_oqpd SmallNorm+__svml_dasin_data_internal(%rip), %ymm15, %ymm13
	/* ymm2 = min(Y, x*x): per-lane polynomial argument R^2 for
	   whichever branch that lane takes.  */
	vminpd %ymm15, %ymm7, %ymm2
	/* ymm7 = 2*Y, radicand refined into sqrt(2*Y) below.  */
	vaddpd %ymm15, %ymm15, %ymm7
	/* ymm1 = SelMask: !(|x| < 0.5), unordered-true so NaNs take the
	   sqrt path.  */
	vcmpnlt_uqpd %ymm10, %ymm4, %ymm1
	/* Seed 1/sqrt(Y) from the single-precision vrsqrtps estimate.  */
	vcvtpd2ps %ymm15, %xmm11
	vmovupd poly_coeff+64+__svml_dasin_data_internal(%rip), %ymm10
	vmulpd %ymm2, %ymm2, %ymm15			/* ymm15 = R^4 power for Estrin */
	vrsqrtps %xmm11, %xmm12
	vmovupd poly_coeff+192+__svml_dasin_data_internal(%rip), %ymm11
	vfmadd213pd poly_coeff+96+__svml_dasin_data_internal(%rip), %ymm2, %ymm10
	vcvtps2pd %xmm12, %ymm14			/* ymm14 ~= 1/sqrt(Y) */
	vmulpd %ymm15, %ymm15, %ymm12			/* ymm12 = R^8 power for Estrin */
	vfmadd213pd poly_coeff+224+__svml_dasin_data_internal(%rip), %ymm2, %ymm11
	/* Zero the rsqrt estimate on tiny-Y lanes to avoid Inf.  */
	vandnpd %ymm14, %ymm13, %ymm0
	/* ymm3 = sign bit of x = x & ~AbsMask; re-applied at the end.  */
	vandnpd %ymm5, %ymm6, %ymm3
	vmulpd %ymm0, %ymm0, %ymm6			/* ymm6 = est^2 */
	vmovupd poly_coeff+128+__svml_dasin_data_internal(%rip), %ymm13
	vmovupd poly_coeff+256+__svml_dasin_data_internal(%rip), %ymm14
	vfmadd213pd poly_coeff+160+__svml_dasin_data_internal(%rip), %ymm2, %ymm13
	vfmadd213pd poly_coeff+288+__svml_dasin_data_internal(%rip), %ymm2, %ymm14
	vfmadd213pd %ymm11, %ymm15, %ymm13
	vmulpd %ymm7, %ymm0, %ymm9			/* ymm9 ~= sqrt(2*Y) = 2Y*est */
	/* ymm7 = e = est^2*2Y - 2: residual error of the estimate,
	   consumed by the sqrt_coeff correction polynomial.  */
	vfmsub213pd Two+__svml_dasin_data_internal(%rip), %ymm6, %ymm7
	vmovupd poly_coeff+__svml_dasin_data_internal(%rip), %ymm6
	vmovupd sqrt_coeff+__svml_dasin_data_internal(%rip), %ymm0
	vmulpd %ymm7, %ymm9, %ymm8
	vfmadd213pd poly_coeff+32+__svml_dasin_data_internal(%rip), %ymm2, %ymm6
	/* Horner evaluation of the 4-term sqrt correction in e.  */
	vfmadd213pd sqrt_coeff+32+__svml_dasin_data_internal(%rip), %ymm7, %ymm0
	vfmadd213pd %ymm10, %ymm15, %ymm6
	vmovupd poly_coeff+320+__svml_dasin_data_internal(%rip), %ymm10
	vfmadd213pd sqrt_coeff+64+__svml_dasin_data_internal(%rip), %ymm7, %ymm0
	vfmadd213pd %ymm13, %ymm12, %ymm6
	vfmadd213pd poly_coeff+352+__svml_dasin_data_internal(%rip), %ymm2, %ymm10
	vfmadd213pd sqrt_coeff+96+__svml_dasin_data_internal(%rip), %ymm7, %ymm0
	vfmadd213pd %ymm14, %ymm15, %ymm6
	/* ymm0 = corrected sqrt result: poly(e)*(sqrt*e) - sqrt.  */
	vfmsub213pd %ymm9, %ymm8, %ymm0
	vfmadd213pd %ymm10, %ymm15, %ymm6		/* ymm6 = Poly(R^2), 12 terms via Estrin */
	/* ymm4 = R: corrected sqrt on SelMask lanes, |x| otherwise.  */
	vblendvpd %ymm1, %ymm0, %ymm4, %ymm4
	vmulpd %ymm6, %ymm2, %ymm2
	/* ymm2 = R + R*R^2*Poly ~= asin magnitude (sign-flipped on the
	   sqrt path so the Pi/2 addition below lands correctly).  */
	vfmadd213pd %ymm4, %ymm4, %ymm2
	/* Add Pi/2 only on the SelMask (|x| >= 0.5) lanes.  */
	vandpd Pi2H+__svml_dasin_data_internal(%rip), %ymm1, %ymm1
	vaddpd %ymm2, %ymm1, %ymm0
	/* Re-apply the sign of x.  */
	vxorpd %ymm3, %ymm0, %ymm0

	/* NOTE(review): the test that sets the flags for this branch
	   (derived from the |x| > 1 mask in ymm9) is elided in this
	   excerpt.  */
	/* Go to special inputs processing branch.  */
	jne L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm5

	/* Restore registers and exit the function (fast path — elided
	   here).  */

	/* Branch to process special inputs (NaN, |x| > 1): spill the
	   input and the partial result, then fix up lanes one at a time
	   with the scalar asin.  */
L(SPECIAL_VALUES_BRANCH):
	vmovupd %ymm5, 32(%rsp)				/* spill original input x */
	vmovupd %ymm0, 64(%rsp)				/* spill vector result */
	# LOE rbx r12 r13 r14 r15 edx ymm0
	# LOE rbx r12 r13 r14 r15 eax edx
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Call scalar math function for the current lane.  */
	jc L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

	/* Per-lane loop over the special-value mask.  */
L(SPECIAL_VALUES_LOOP):
	/* Check bits in range mask.  */
	jl L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All lanes fixed up: reload the merged result.  */
	vmovupd 64(%rsp), %ymm0
	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 ymm0

	/* Scalar math function call to process one special input lane;
	   %r14 indexes the lane within the 32-byte spill slots.  */
	vmovsd 32(%rsp, %r14, 8), %xmm0			/* xmm0 = x[lane] */
	# LOE rbx r14 r15 r12d r13d xmm0
	/* NOTE(review): the actual call to the scalar asin is elided in
	   this excerpt; xmm0 is assumed to hold asin(x[lane]) here.  */
	vmovsd %xmm0, 64(%rsp, %r14, 8)			/* result[lane] = asin(x[lane]) */

	/* Process remaining special inputs in loop.  */
	jmp L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVdN4v_asin_avx2)
216 .section .rodata, "a"
219 #ifdef __svml_dasin_data_internal_typedef
220 typedef unsigned int VUINT32;
222 __declspec(align(32)) VUINT32 AbsMask[4][2];
223 __declspec(align(32)) VUINT32 OneHalf[4][2];
224 __declspec(align(32)) VUINT32 SmallNorm[4][2];
225 __declspec(align(32)) VUINT32 One[4][2];
226 __declspec(align(32)) VUINT32 Two[4][2];
227 __declspec(align(32)) VUINT32 sqrt_coeff[4][4][2];
228 __declspec(align(32)) VUINT32 poly_coeff[12][4][2];
229 __declspec(align(32)) VUINT32 Pi2H[4][2];
230 } __svml_dasin_data_internal;
232 __svml_dasin_data_internal:
234 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
237 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
240 .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
243 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
246 .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
249 .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
250 .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
251 .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
252 .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
255 .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
256 .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
257 .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
258 .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
259 .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
260 .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
261 .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
262 .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
263 .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
264 .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
265 .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
266 .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
269 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
271 .type __svml_dasin_data_internal, @object
272 .size __svml_dasin_data_internal, .-__svml_dasin_data_internal