1 /* Function sincosf vectorized with SSE4.
2 Copyright (C) 2014-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
/* _ZGVbN4vl4l4_sincosf_sse4: SSE4 vector sincosf kernel.
   Described here is only what this listing shows:
   - constants are read from __svml_s_trig_data through %rax;
   - a vector held in %xmm13 is stored through %rdi;
   - a second path spills %xmm0 and two result vectors to the stack
     and calls scalar sinf and cosf on individual elements.
   NOTE(review): going by the vl4l4 name, %xmm0 presumably carries the
   four inputs and %rdi the sin output array; the cos store is not
   visible in this listing -- confirm against the vector ABI.  */
23 ENTRY (_ZGVbN4vl4l4_sincosf_sse4)
25 ALGORITHM DESCRIPTION:
27 1) Range reduction to [-Pi/4; +Pi/4] interval
28 a) Grab sign from source argument and save it.
29 b) Remove sign using AND operation
30 c) Getting octant Y by 2/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer S for destination sign setting.
33 SS = ((S-S&1)&2)<<30; For sin part
34 SC = ((S+S&1)&2)<<30; For cos part
35 f) Change destination sign if source sign is negative
37 g) Subtract "Right Shifter" (0x4B000000) value
38 h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
39 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
41 a) Calculate X^2 = X * X
42 b) Calculate 2 polynomials for sin and cos:
43 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
44 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4))));
45 c) Swap RS & RC if first bit of obtained value after
46 Right Shifting is set to 1. Using And, Andnot & Or operations.
47 3) Destination sign setting
48 a) Set shifted destination sign using XOR operation:
50 R2 = XOR( RC, SC ). */
/* Frame setup CFI: the CFA moves by 8, %rbp is recorded as saved at
   offset 0 and then becomes the CFA register.  The matching push and
   mov instructions are not visible in this listing.  */
53 cfi_adjust_cfa_offset (8)
54 cfi_rel_offset (%rbp, 0)
56 cfi_def_cfa_register (%rbp)
/* %rax = address of the __svml_s_trig_data table, taken from the GOT.
   Each __sXxx(%rax) operand below addresses the field at that offset
   from the table base.  */
59 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Spill %xmm12 and %xmm9; both are reloaded from the same slots
   (176 and 160) further down.  %xmm12 is then free for the mask.  */
60 movups %xmm12, 176(%rsp)
61 movups %xmm9, 160(%rsp)
62 movups __sAbsMask(%rax), %xmm12
64 /* Absolute argument computation */
67 movups __sInvPI(%rax), %xmm7
70 /* c) Getting octant Y by 2/Pi multiplication
71 d) Add "Right Shifter" value. */
/* Spill %xmm10 (slot 144) before it is overwritten with PI1.  */
73 movups %xmm10, 144(%rsp)
74 movups __sPI1(%rax), %xmm10
76 /* h) Subtract Y*(PI/2) from X argument, where PI/2 divided to 4 parts:
77 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4. */
79 addps __sRShifter(%rax), %xmm7
81 /* e) Treat obtained value as integer S for destination sign setting */
84 /* g) Subtract "Right Shifter" (0x4B000000) value */
85 subps __sRShifter(%rax), %xmm7
88 movups __sPI2(%rax), %xmm6
/* Spill %xmm13 (slot 112); it is reloaded from there further down.  */
89 movups %xmm13, 112(%rsp)
94 movups __sSignMask(%rax), %xmm3
96 movups __sOneHalf(%rax), %xmm4
/* %xmm5 becomes a per-lane mask: all ones where its previous value is
   not less than or equal to __sRangeReductionVal (greater, or
   unordered).  NOTE(review): %xmm5 presumably holds |x| and the mask
   selects lanes for the scalar fallback; neither its setup nor its
   use is visible in this listing -- confirm.  */
98 cmpnleps __sRangeReductionVal(%rax), %xmm5
103 /* Result sign calculations */
107 /* Add correction term 0.5 for cos() part */
/* %xmm10 and %xmm6, which were loaded with PI1 and PI2 above, are
   reused for PI3 and PI4.  */
113 movups __sPI3(%rax), %xmm10
120 movups __sPI4(%rax), %xmm6
131 movups __sA9(%rax), %xmm7
133 /* 2) Polynomial (minimax for sin within [-Pi/4; +Pi/4] interval)
134 a) Calculate X^2 = X * X
135 b) Calculate 2 polynomials for sin and cos:
136 RS = X * ( A0 + X^2 * (A1 + x^2 * (A2 + x^2 * (A3))));
137 RC = B0 + X^2 * (B1 + x^2 * (B2 + x^2 * (B3 + x^2 * (B4)))) */
/* The same coefficients (A7, A5, A3) are added into both %xmm3 and
   %xmm7, so two accumulators step through one coefficient sequence.
   NOTE(review): presumably one accumulator per output; the multiplies
   between these adds are not visible in this listing.  */
141 addps __sA7(%rax), %xmm3
142 addps __sA7(%rax), %xmm7
145 addps __sA5(%rax), %xmm3
146 addps __sA5(%rax), %xmm7
149 addps __sA3(%rax), %xmm3
150 addps __sA3(%rax), %xmm7
/* Reload the spilled %xmm9, %xmm10, %xmm12 and %xmm13 from their
   slots.  %xmm13 is written out through %rdi before it is reloaded;
   movaps is an aligned store, so (%rdi) must be 16-byte aligned.  */
163 movups 160(%rsp), %xmm9
164 movaps %xmm13, (%rdi)
165 movups 144(%rsp), %xmm10
166 movups 176(%rsp), %xmm12
167 movups 112(%rsp), %xmm13
/* Frame teardown CFI, mirroring the setup at the top.  */
170 cfi_def_cfa_register (%rsp)
172 cfi_adjust_cfa_offset (-8)
/* Stage data for the scalar calls below: %xmm0 goes to slot 128 (the
   slot the sinf and cosf arguments are read from), %xmm13 to slot 192
   (where sinf results are written) and %xmm1 to slot 256 (where cosf
   results are written).  */
178 movups %xmm0, 128(%rsp)
179 movups %xmm13, 192(%rsp)
180 movups %xmm1, 256(%rsp)
/* Spill %xmm8, %xmm11, %xmm14 and %xmm15; they are reloaded from the
   same slots after the CFI records below.  */
185 movups %xmm8, 48(%rsp)
186 movups %xmm11, 32(%rsp)
187 movups %xmm14, 16(%rsp)
188 movups %xmm15, (%rsp)
/* CFI records for DWARF registers 12 to 15 at %rsp-relative offsets
   104, 96, 88 and 80.  NOTE(review): presumably %r12 to %r15; the
   instructions saving them are not visible in this listing.  */
191 cfi_offset_rel_rsp (12, 104)
194 cfi_offset_rel_rsp (13, 96)
197 cfi_offset_rel_rsp (14, 88)
200 cfi_offset_rel_rsp (15, 80)
220 movups 48(%rsp), %xmm8
222 movups 32(%rsp), %xmm11
223 movups 16(%rsp), %xmm14
224 movups (%rsp), %xmm15
/* Reload the two result vectors from the slots the scalar calls
   write into (192 for sinf, 256 for cosf).  */
235 movups 192(%rsp), %xmm13
236 movups 256(%rsp), %xmm1
/* Scalar fallback for the 4-byte element at byte offset 4 + 8 * %r15:
   the input is read from the 128 slot, sinf output goes to the 192
   slot and cosf output to the 256 slot, all at that same offset.  The
   input is reloaded before cosf because %xmm0 holds the sinf result
   after the first call.  NOTE(review): the loop and branch logic
   driving %r15 is not visible in this listing.  */
242 movss 132(%rsp,%r15,8), %xmm0
244 call JUMPTARGET(sinf)
246 movss %xmm0, 196(%rsp,%r15,8)
247 movss 132(%rsp,%r15,8), %xmm0
249 call JUMPTARGET(cosf)
251 movss %xmm0, 260(%rsp,%r15,8)
/* Same sequence for the element at byte offset 8 * %r15.  */
256 movss 128(%rsp,%r15,8), %xmm0
258 call JUMPTARGET(sinf)
260 movss %xmm0, 192(%rsp,%r15,8)
261 movss 128(%rsp,%r15,8), %xmm0
263 call JUMPTARGET(cosf)
265 movss %xmm0, 256(%rsp,%r15,8)
268 END (_ZGVbN4vl4l4_sincosf_sse4)
269 libmvec_hidden_def(_ZGVbN4vl4l4_sincosf_sse4)
271 /* vvv version implemented with wrapper to vl4l4 variant. */
/* Only fragments of two code paths are visible in this listing: one
   that addresses its frame through %rsp, %rdi and %rsi, and one that
   addresses it through %esp.  Both call the vl4l4 kernel.
   NOTE(review): the %esp path is presumably the x32 (__ILP32__)
   build, selected by a preprocessor conditional that is not shown
   here -- confirm.  */
272 ENTRY (_ZGVbN4vvv_sincosf_sse4)
/* First path.  CFA is 112 bytes above %rsp; the stack adjustment
   itself is not visible in this listing.  */
275 .cfi_def_cfa_offset 112
/* Save %xmm1 to %xmm4 into the local frame with unaligned stores.
   NOTE(review): in the vvv variant these presumably carry the
   per-lane sin and cos destination pointers; the instructions that
   point %rdi and %rsi into the frame are not visible here.  */
276 movdqu %xmm1, 32(%rsp)
278 movdqu %xmm2, 48(%rdi)
280 movdqu %xmm3, 48(%rsi)
281 movdqu %xmm4, 64(%rsi)
282 call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* CFA offset back to 8 once the frame has been released.  */
308 .cfi_def_cfa_offset 8
/* Second path.  CFA is 80 bytes above the stack pointer.  */
312 .cfi_def_cfa_offset 80
/* Aligned store, so 16(%esp) must be 16-byte aligned here.  */
314 movaps %xmm1, 16(%esp)
317 call HIDDEN_JUMPTARGET(_ZGVbN4vl4l4_sincosf_sse4)
/* Load the eight 4-byte values at 32..60(%esp) into %xmm0 one at a
   time.  NOTE(review): presumably the four sin results (32..44) and
   then the four cos results (48..60), each stored through a caller
   pointer by instructions not visible in this listing.  */
319 movss 32(%esp), %xmm0
322 movss 36(%esp), %xmm0
325 movss 40(%esp), %xmm0
328 movss 44(%esp), %xmm0
331 movss 48(%esp), %xmm0
334 movss 52(%esp), %xmm0
337 movss 56(%esp), %xmm0
340 movss 60(%esp), %xmm0
/* CFA offset back to 8 once the frame has been released.  */
343 .cfi_def_cfa_offset 8
346 END (_ZGVbN4vvv_sincosf_sse4)