1 /* Function atan vectorized with SSE4.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
23 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
24 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
25 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
26 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
27 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
31 /* Offsets for data table __svml_datan_data_internal_avx512
35 #define MaxThreshold 32 /* Byte offset of the MaxThreshold constant in __svml_datan_data_internal_avx512 (third 16-byte field, after AbsMask and Shifter). */
/* _ZGVbN2v_atan_sse4: atan vectorized with SSE4, working on packed
   doubles in XMM registers.  Every constant is read RIP-relative from
   __svml_datan_data_internal_avx512 using the field offsets defined
   above.
   NOTE(review): the argument/result registers are not evident from the
   loads below -- presumably %xmm0 in and out, per the x86-64 vector
   function ABI implied by the _ZGVbN2v prefix; confirm.  */
49 .section .text.sse4, "ax", @progbits
50 ENTRY(_ZGVbN2v_atan_sse4)
/* rcx = address 128 bytes into Tbl_H, i.e. &Tbl_H[16] with 8-byte
   entries.  The gathers at the end of the function use displacement
   -128 to reach the first half of Tbl_H and displacement 0 to reach
   the second half.  */
51 lea Tbl_H+128+__svml_datan_data_internal_avx512(%rip), %rcx
/* xmm4 = first field of the table (offset 0), which is AbsMask.  */
52 movups __svml_datan_data_internal_avx512(%rip), %xmm4
/* xmm3 = Shifter constant.  */
53 movups Shifter+__svml_datan_data_internal_avx512(%rip), %xmm3
61 * table lookup sequence
62 * VPERMUTE not available
/* xmm2 = MaxThreshold constant.  */
68 movups MaxThreshold+__svml_datan_data_internal_avx512(%rip), %xmm2
71 /* saturate X range */
/* xmm8 = LargeX, the bound used for the saturation mentioned above.  */
72 movups LargeX+__svml_datan_data_internal_avx512(%rip), %xmm8
/* xmm7 += One (1.0 in both lanes).  */
75 addpd One+__svml_datan_data_internal_avx512(%rip), %xmm7
/* xmm6 = MOne (-1.0 in both lanes).  */
77 movups MOne+__svml_datan_data_internal_avx512(%rip), %xmm6
/* Mask each lane of xmm10 with .FLT_11.
   NOTE(review): .FLT_11 is presumably the 0x00000078-per-lane constant
   emitted at the end of this file, which would keep a 4-bit index
   already scaled by 8 (a byte offset into 16 doubles) -- confirm.  */
89 pand .FLT_11(%rip), %xmm10
91 /* set table value to Pi/2 for large X */
92 movups Pi2+__svml_datan_data_internal_avx512(%rip), %xmm4
/* Shuffle immediate 2: dword 0 of xmm11 receives dword 2 of xmm10 (the
   low half of the upper lane); the other dwords receive dword 0.
   NOTE(review): presumably so the upper lane's offset can be moved to
   a general-purpose register -- confirm.  */
95 pshufd $2, %xmm10, %xmm11
98 /* polynomial evaluation */
/* Three accumulators are seeded from even-numbered coefficients and
   then have the following odd-numbered coefficient added:
   xmm5: coeff[0] then + coeff[1];
   xmm6: coeff[2] then + coeff[3];
   xmm8: coeff[4] then + coeff[5].
   Each coefficient occupies 16 bytes (same value in both lanes).  */
102 movups coeff+__svml_datan_data_internal_avx512(%rip), %xmm5
104 movups coeff+32+__svml_datan_data_internal_avx512(%rip), %xmm6
108 addpd coeff+16+__svml_datan_data_internal_avx512(%rip), %xmm5
111 addpd coeff+48+__svml_datan_data_internal_avx512(%rip), %xmm6
114 movups coeff+64+__svml_datan_data_internal_avx512(%rip), %xmm8
117 addpd coeff+80+__svml_datan_data_internal_avx512(%rip), %xmm8
/* xmm14 = per-lane all-ones mask where dIndexMed <= xmm12, else zero
   (AT&T operand order: the destination xmm14 is the left-hand side of
   the comparison).
   NOTE(review): presumably selects between the two table halves
   gathered below -- confirm.  */
120 movups dIndexMed+__svml_datan_data_internal_avx512(%rip), %xmm14
121 cmplepd %xmm12, %xmm14
/* Two-lane gather from Tbl_H.  rax is the byte offset used for the low
   lane (movsd) and rdx the one used for the high lane (movhpd).
   xmm13 = { Tbl_H[rax], Tbl_H[rdx] }             (first half, -128 bias)
   xmm15 = { Tbl_H[128 + rax], Tbl_H[128 + rdx] } (second half).  */
126 movsd -128(%rax, %rcx), %xmm13
127 movsd (%rcx, %rax), %xmm15
128 movhpd -128(%rdx, %rcx), %xmm13
129 movhpd (%rcx, %rdx), %xmm15
139 END(_ZGVbN2v_atan_sse4)
141 .section .rodata, "a"
/* C-style description of the constant table layout.  It is only seen
   by the compiler when __svml_datan_data_internal_avx512_typedef is
   defined; the assembler data that follows is emitted in this same
   field order.  With 4-byte VUINT32, a [2][2] field is 16 bytes (one
   XMM-wide constant), a [32][2] field is 256 bytes (32 doubles) and
   coeff[6][2][2] is 96 bytes (six XMM-wide constants).  */
144 #ifdef __svml_datan_data_internal_avx512_typedef
145 typedef unsigned int VUINT32;
147 __declspec(align(16)) VUINT32 AbsMask[2][2];
148 __declspec(align(16)) VUINT32 Shifter[2][2];
149 __declspec(align(16)) VUINT32 MaxThreshold[2][2];
150 __declspec(align(16)) VUINT32 MOne[2][2];
151 __declspec(align(16)) VUINT32 One[2][2];
152 __declspec(align(16)) VUINT32 LargeX[2][2];
153 __declspec(align(16)) VUINT32 Zero[2][2];
/* High and low parts of the lookup table, 32 entries each.  */
154 __declspec(align(16)) VUINT32 Tbl_H[32][2];
155 __declspec(align(16)) VUINT32 Tbl_L[32][2];
156 __declspec(align(16)) VUINT32 dIndexMed[2][2];
157 __declspec(align(16)) VUINT32 Pi2[2][2];
158 __declspec(align(16)) VUINT32 Pi2_low[2][2];
/* Six polynomial coefficients, each replicated across both lanes.  */
159 __declspec(align(16)) VUINT32 coeff[6][2][2];
160 } __svml_datan_data_internal_avx512;
/* Constant table for the vector atan kernel.  Field order and sizes
   follow the layout typedef; each 16-byte field holds the same double
   in both lanes.  */
162 __svml_datan_data_internal_avx512:
/* AbsMask: all bits set except the sign bit.  */
164 .quad 0x7fffffffffffffff, 0x7fffffffffffffff
/* Shifter: 1.5 * 2^50.
   NOTE(review): at this magnitude one ulp is 0.25, so adding it
   presumably rounds the argument to a multiple of 0.25 and leaves the
   table index in the low mantissa bits -- confirm.  */
167 .quad 0x4318000000000000, 0x4318000000000000
/* MaxThreshold: 7.875.  */
170 .quad 0x401f800000000000, 0x401f800000000000
/* MOne: -1.0.  */
173 .quad 0xbff0000000000000, 0xbff0000000000000
/* One: 1.0.  */
176 .quad 0x3ff0000000000000, 0x3ff0000000000000
/* LargeX: 2^128.  */
179 .quad 0x47f0000000000000, 0x47f0000000000000
/* Zero: 0.0.  */
182 .quad 0x0000000000000000, 0x0000000000000000
/* Tbl_H: 32 doubles, high parts of the table values.
   NOTE(review): entries look like atan(i * 0.25) for i = 0..31
   (entry 0 is 0.0, entry 4 is pi/4) -- confirm.  */
185 .quad 0x0000000000000000, 0x3fcf5b75f92c80dd
186 .quad 0x3fddac670561bb4f, 0x3fe4978fa3269ee1
187 .quad 0x3fe921fb54442d18, 0x3fecac7c57846f9e
188 .quad 0x3fef730bd281f69b, 0x3ff0d38f2c5ba09f
189 .quad 0x3ff1b6e192ebbe44, 0x3ff270ef55a53a25
190 .quad 0x3ff30b6d796a4da8, 0x3ff38d6a6ce13353
191 .quad 0x3ff3fc176b7a8560, 0x3ff45b54837351a0
192 .quad 0x3ff4ae10fc6589a5, 0x3ff4f68dea672617
193 .quad 0x3ff5368c951e9cfd, 0x3ff56f6f33a3e6a7
194 .quad 0x3ff5a25052114e60, 0x3ff5d013c41adabd
195 .quad 0x3ff5f97315254857, 0x3ff61f06c6a92b89
196 .quad 0x3ff6414d44094c7c, 0x3ff660b02c736a06
197 .quad 0x3ff67d8863bc99bd, 0x3ff698213a9d5053
198 .quad 0x3ff6b0bae830c070, 0x3ff6c78c7edeb195
199 .quad 0x3ff6dcc57bb565fd, 0x3ff6f08f07435fec
200 .quad 0x3ff7030cf9403197, 0x3ff7145eac2088a4
/* Tbl_L: 32 doubles, presumably the low-order correction terms that
   pair index-for-index with Tbl_H (magnitudes are around 1e-16 or
   below) -- confirm.  */
203 .quad 0x0000000000000000, 0x3c68ab6e3cf7afbd
204 .quad 0x3c7a2b7f222f65e2, 0x3c72419a87f2a458
205 .quad 0x3c81a62633145c07, 0x3c80dae13ad18a6b
206 .quad 0x3c7007887af0cbbd, 0xbc9bd0dc231bfd70
207 .quad 0x3c9b1b466a88828e, 0xbc9a66b1af5f84fb
208 .quad 0x3c96254cb03bb199, 0xbc812c77e8a80f5c
209 .quad 0xbc4441a3bd3f1084, 0x3c79e4a72eedacc4
210 .quad 0xbc93b03e8a27f555, 0x3c9934f9f2b0020e
211 .quad 0xbc996f47948a99f1, 0xbc7df6edd6f1ec3b
212 .quad 0x3c78c2d0c89de218, 0x3c9f82bba194dd5d
213 .quad 0xbc831151a43b51ca, 0xbc8487d50bceb1a5
214 .quad 0xbc9c5f60a65c7397, 0xbc7acb6afb332a0f
215 .quad 0xbc99b7bd2e1e8c9c, 0xbc9b9839085189e3
216 .quad 0xbc97d1ab82ffb70b, 0x3c99239ad620ffe2
217 .quad 0xbc929c86447928e7, 0xbc8957a7170df016
218 .quad 0xbc7cbe1896221608, 0xbc9fda5797b32a0b
/* dIndexMed: the Shifter bit pattern plus 0x10 in the low mantissa
   bits.
   NOTE(review): presumably the shifted value that corresponds to table
   index 16, the midpoint of the 32-entry tables -- confirm.  */
221 .quad 0x4318000000000010, 0x4318000000000010
/* Pi2: pi/2 rounded to double.  */
224 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18
/* Pi2_low: low-order part of pi/2 (about 6.1e-17).  */
227 .quad 0x3c91a62633145c07, 0x3c91a62633145c07
/* coeff[0..5]: polynomial coefficients with alternating signs.  The
   last two are close to 1/5 and -1/3, so the order runs from the
   highest-degree term down to the lowest.  */
230 .quad 0x3fb2e9b9f5c4fe97, 0x3fb2e9b9f5c4fe97
231 .quad 0xbfb74257c46790cc, 0xbfb74257c46790cc
232 .quad 0x3fbc71bfeff916a0, 0x3fbc71bfeff916a0
233 .quad 0xbfc249248eef04da, 0xbfc249248eef04da
234 .quad 0x3fc999999998741e, 0x3fc999999998741e
235 .quad 0xbfd555555555554d, 0xbfd555555555554d
/* ELF symbol metadata: mark the table as a data object and record its
   size as the distance from its label to this point.  */
237 .type __svml_datan_data_internal_avx512, @object
238 .size __svml_datan_data_internal_avx512, .-__svml_datan_data_internal_avx512
/* 16-byte constant: 0x00000078 in the low dword of each 64-bit lane,
   zero in the high dword.
   NOTE(review): presumably this is the body of .FLT_11, the pand mask
   applied to xmm10 in _ZGVbN2v_atan_sse4 -- confirm the label.  */
242 .long 0x00000078, 0x00000000, 0x00000078, 0x00000000
243 .type .FLT_11, @object