1 /* Function atan2f vectorized with SSE4.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
21 * For 0.0 <= x <= 7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
22 * For 7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
23 * For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
24 * For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
25 * For 39.0/16.0 <= x <= inf : atan(x) = atan(inf) + atan(s), where s=-1.0/x
26 * Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
31 /* Offsets for data table __svml_satan2_data_internal
47 #define iCHK_WORK_SUB 224
48 #define iCHK_WORK_CMP 240

/*
 * _ZGVbN4vv_atan2f_sse4 -- 4-lane SSE4 vector atan2f.
 * Vector-ABI mangled name: 'b' = SSE ISA class, N4 = 4 lanes,
 * "vv" = two vector arguments.
 * NOTE(review): the scalar fallback below reloads the saved inputs
 * from 32(%rsp)/48(%rsp) (spilled xmm12/xmm13) into xmm0/xmm1, which
 * suggests xmm0 = y, xmm1 = x on entry -- confirm against the
 * unabridged source; this excerpt elides many instructions.
 */
52 .section .text.sse4, "ax", @progbits
53 ENTRY(_ZGVbN4vv_atan2f_sse4)
55 cfi_def_cfa_offset(96)
59 * #define NO_VECTOR_ZERO_ATAN2_ARGS
63 * The end of declarations

/* sABS_MASK = 0x7FFFFFFF per lane: mask used to clear sign bits. */
67 movups sABS_MASK+__svml_satan2_data_internal(%rip), %xmm10
76 * 1) If y<x then a = y, b = x, PIO2 = 0
77 * 2) If y>x then a = -x, b = y, PIO2 = Pi/2
79 movups sSIGN_MASK+__svml_satan2_data_internal(%rip), %xmm6

95 /* Testing on working interval. */
96 movdqu iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %xmm14
102 movdqu iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %xmm2
105 pcmpeqd %xmm2, %xmm15
106 pcmpgtd %xmm2, %xmm14
117 movaps %xmm14, %xmm15

/* Polynomial Poly11(s^2): coefficients sPC8..sPC0 are folded into two
   parallel accumulator chains (xmm2 and xmm3) for shorter dependency
   chains.  NOTE(review): the intervening mulps steps are elided in
   this excerpt.  */
121 movups sPC8+__svml_satan2_data_internal(%rip), %xmm2
125 movups sPC7+__svml_satan2_data_internal(%rip), %xmm3
128 addps sPC6+__svml_satan2_data_internal(%rip), %xmm2
130 addps sPC5+__svml_satan2_data_internal(%rip), %xmm3
132 addps sPC4+__svml_satan2_data_internal(%rip), %xmm2
134 addps sPC3+__svml_satan2_data_internal(%rip), %xmm3
136 addps sPC2+__svml_satan2_data_internal(%rip), %xmm2
138 addps sPC1+__svml_satan2_data_internal(%rip), %xmm3
140 addps sPC0+__svml_satan2_data_internal(%rip), %xmm15

142 /* if x<0, sPI = Pi, else sPI = 0 */
143 movups __svml_satan2_data_internal(%rip), %xmm5
145 andnps sPIO2+__svml_satan2_data_internal(%rip), %xmm7

149 /* Reconstruction. */
151 andps sPI+__svml_satan2_data_internal(%rip), %xmm4

157 /* Special branch for fast (vector) processing of zero arguments */

160 /* Go to auxiliary branch */
162 # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13

164 /* Return from auxiliary branch
165 * for out of main path inputs
168 L(AUX_BRANCH_RETURN):
170 * Special branch for fast (vector) processing of zero arguments
171 * The end of implementation

175 /* Go to special inputs processing branch */
176 jne L(SPECIAL_VALUES_BRANCH)
177 # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13

180 * and exit the function
185 cfi_def_cfa_offset(8)
187 cfi_def_cfa_offset(96)

/* Cold path: at least one lane fell outside the fast path.  Spill the
   original inputs (xmm12/xmm13) and the vector result (xmm0) to the
   stack so the per-lane scalar calls can read inputs and patch the
   result in place.  */
193 L(SPECIAL_VALUES_BRANCH):
194 movups %xmm12, 32(%rsp)
195 movups %xmm13, 48(%rsp)
196 movups %xmm0, 64(%rsp)
197 # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
207 # LOE rbx rbp r15 r12d r13d

216 /* Call scalar math function */
217 jc L(SCALAR_MATH_CALL)
218 # LOE rbx rbp r15 r12d r13d

224 L(SPECIAL_VALUES_LOOP):

228 /* Check bits in range mask */
229 jl L(RANGEMASK_CHECK)
230 # LOE rbx rbp r15 r12d r13d

/* All flagged lanes processed: reload the patched result vector. */
238 movups 64(%rsp), %xmm0
245 # LOE rbx rbp r12 r13 r14 r15 xmm0

247 /* Scalar math function call
248 * to process special input
253 movss 32(%rsp, %r14, 4), %xmm0
254 movss 48(%rsp, %r14, 4), %xmm1
256 # LOE rbx rbp r14 r15 r12d r13d xmm0

/* Store the scalar result back into lane %r14 of the spilled output. */
258 movss %xmm0, 64(%rsp, %r14, 4)

260 /* Process special inputs in loop */
261 jmp L(SPECIAL_VALUES_LOOP)
265 # LOE rbx rbp r15 r12d r13d

268 * for out of main path inputs

/* cmpordps: lane is all-ones iff neither operand is NaN. */
272 /* Check if both X & Y are not NaNs: iXYnotNAN */
275 cmpordps %xmm13, %xmm3
276 cmpordps %xmm12, %xmm2

279 * Path for zero arguments (at least one of both)
280 * Check if both args are zeros (den. is zero)

284 /* Check if at least one of X or Y is zero: iAXAYZERO */
285 pcmpeqd %xmm5, %xmm11
286 pcmpeqd %xmm5, %xmm10

290 /* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */

293 /* Exclude from previous callout mask zero (and not NaN) arguments */
294 movaps %xmm11, %xmm10

297 /* Set sPIO2 to zero if den. is zero */

303 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
304 pcmpgtd %xmm13, %xmm5

/* movmskps collapses the per-lane callout mask to a 4-bit scalar in
   %edx; nonzero means some lane still needs the special-values path. */
308 /* Merge results from main and spec path */
313 movmskps %xmm10, %edx

320 /* Return to main vector processing path */
321 jmp L(AUX_BRANCH_RETURN)
322 # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13
323 END(_ZGVbN4vv_atan2f_sse4)
325 .section .rodata, "a"

/* C-style documentation view of the constant table; compiled out
   unless __svml_satan2_data_internal_typedef is defined.  The sPC*
   field names used by the code correspond to the sA0x coefficient
   rows emitted below.  */
328 #ifdef __svml_satan2_data_internal_typedef
329 typedef unsigned int VUINT32;
331 __declspec(align(16)) VUINT32 sZERO[4][1];
332 __declspec(align(16)) VUINT32 sSIGN_MASK[4][1];
333 __declspec(align(16)) VUINT32 sABS_MASK[4][1];
334 __declspec(align(16)) VUINT32 sPIO2[4][1];
335 __declspec(align(16)) VUINT32 sPI[4][1];
336 __declspec(align(16)) VUINT32 sPC8[4][1];
337 __declspec(align(16)) VUINT32 sPC7[4][1];
338 __declspec(align(16)) VUINT32 sPC6[4][1];
339 __declspec(align(16)) VUINT32 sPC5[4][1];
340 __declspec(align(16)) VUINT32 sPC4[4][1];
341 __declspec(align(16)) VUINT32 sPC3[4][1];
342 __declspec(align(16)) VUINT32 sPC2[4][1];
343 __declspec(align(16)) VUINT32 sPC1[4][1];
344 __declspec(align(16)) VUINT32 sPC0[4][1];
345 __declspec(align(16)) VUINT32 iCHK_WORK_SUB[4][1];
346 __declspec(align(16)) VUINT32 iCHK_WORK_CMP[4][1];
347 } __svml_satan2_data_internal;

/* One 16-byte row per constant: the same 32-bit value broadcast to
   all 4 lanes.  Row order must match the byte-offset #defines used
   by the code (e.g. iCHK_WORK_SUB = 224, iCHK_WORK_CMP = 240).  */
349 __svml_satan2_data_internal:
350 .long 0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
352 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK
354 .long 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
356 .long 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2 = pi/2 as float
358 .long 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI = pi as float

/* Polynomial coefficients (accessed via sPC8..sPC0 offsets); note
   sPC0 = 1.0f (0x3F800000).  */
360 .long 0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sA08
362 .long 0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sA07
364 .long 0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sA06
366 .long 0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sA05
368 .long 0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sA04
370 .long 0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sA03
372 .long 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sA02
374 .long 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sA01
376 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sA00

/* Integer thresholds for the working-interval (callout) test.  */
378 .long 0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
380 .long 0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
382 .type __svml_satan2_data_internal, @object
383 .size __svml_satan2_data_internal, .-__svml_satan2_data_internal