1 /* Function cosf vectorized with SSE4.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
/* _ZGVbN4v_cosf_sse4: SSE4 implementation of vectorized cosf (see the
   algorithm description below).  Constants are read from the
   __svml_s_trig_data table through %rax.  The tail of the routine calls
   the scalar cosf on individual elements, using the stack slots at
   192(%rsp) (arguments) and 256(%rsp) (results).
   NOTE(review): in this view several description lines appear without
   comment delimiters and many instructions/labels are not visible; the
   comments added here describe only the instructions that are shown.  */
23 ENTRY (_ZGVbN4v_cosf_sse4)
25 ALGORITHM DESCRIPTION:
27 1) Range reduction to [-Pi/2; +Pi/2] interval
28 a) We remove sign using AND operation
29 b) Add Pi/2 value to argument X for Cos to Sin transformation
30 c) Getting octant Y by 1/Pi multiplication
31 d) Add "Right Shifter" value
32 e) Treat obtained value as integer for destination sign setting.
33 Shift first bit of this value to the last (sign) position
34 f) Subtract "Right Shifter" value
35 g) Subtract 0.5 from result for octant correction
36 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
37 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
38 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
39 a) Calculate X^2 = X * X
40 b) Calculate polynomial:
41 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
42 3) Destination sign setting
43 a) Set shifted destination sign using XOR operation:
/* Unwind info: the CFA offset grows by 8 and %rbp is recorded at offset 0,
   then the CFA is redefined relative to %rbp.  Presumably this brackets a
   push of %rbp and a frame-pointer setup that are not visible here.  */
47 cfi_adjust_cfa_offset (8)
48 cfi_rel_offset (%rbp, 0)
50 cfi_def_cfa_register (%rbp)
/* %rax = address of the __svml_s_trig_data table, obtained through the
   GOT; every constant below is loaded at a named offset from %rax.  */
54 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* %xmm1 = __sHalfPI, %xmm5 = __sRShifter.  */
55 movups __sHalfPI(%rax), %xmm1
56 movups __sRShifter(%rax), %xmm5
58 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
62 1) Range reduction to [-Pi/2; +Pi/2] interval
63 c) Getting octant Y by 1/Pi multiplication
64 d) Add "Right Shifter" (0x4B000000) value
/* %xmm1 *= __sInvPI (step c); %xmm6 = __sPI1 for the reduction in step h.  */
66 mulps __sInvPI(%rax), %xmm1
67 movups __sPI1(%rax), %xmm6
71 e) Treat obtained value as integer for destination sign setting.
72 Shift first bit of this value to the last (sign) position (S << 31)
76 /* f) Subtract "Right Shifter" (0x4B000000) value */
/* Load further constants: %xmm7 = __sPI2, %xmm5 = __sPI3 (this overwrites
   whatever %xmm5 held; it was loaded with __sRShifter above),
   %xmm3 = __sAbsMask.  */
78 movups __sPI2(%rax), %xmm7
80 movups __sPI3(%rax), %xmm5
81 movups __sAbsMask(%rax), %xmm3
83 /* Check for large and special arguments */
86 /* g) Subtract 0.5 from result for octant correction */
/* %xmm1 -= __sOneHalf.  */
87 subps __sOneHalf(%rax), %xmm1
/* %xmm3 becomes a per-element mask: all ones where %xmm3 is NOT
   less-than-or-equal to __sRangeReductionVal (this predicate is also
   true for unordered, i.e. NaN, operands).  Presumably %xmm3 holds |X| at
   this point; the instruction producing it is not visible here.  */
88 cmpnleps __sRangeReductionVal(%rax), %xmm3
91 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
92 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
/* %xmm6 = __sPI4, replacing whatever %xmm6 held (it was loaded with
   __sPI1 above).  */
99 movups __sPI4(%rax), %xmm6
105 /* a) Calculate X^2 = X * X */
110 3) Destination sign setting
111 a) Set shifted destination sign using XOR operation:
/* Polynomial accumulator starts as %xmm2 = __sA9.  */
115 movups __sA9(%rax), %xmm2
118 b) Calculate polynomial:
119 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
/* Coefficients are added from highest to lowest order (A7, A5, A3),
   matching the nested form above; the multiplications between these
   additions are not visible in this view.  */
122 addps __sA7(%rax), %xmm2
124 addps __sA5(%rax), %xmm2
126 addps __sA3(%rax), %xmm2
/* Unwind info: the CFA is again relative to %rsp and its offset shrinks
   by 8.  Presumably this accompanies the frame teardown and return of the
   fast path, which are not visible here.  */
136 cfi_def_cfa_register (%rsp)
138 cfi_adjust_cfa_offset (-8)
/* Spill %xmm4 to 192(%rsp) and %xmm0 to 256(%rsp).  The scalar code at
   the end of the routine reads its arguments from the 192(%rsp) area and
   stores its results into the 256(%rsp) area.  */
144 movups %xmm4, 192(%rsp)
145 movups %xmm0, 256(%rsp)
/* Save %xmm8-%xmm15 in 16-byte slots at 112(%rsp) down to (%rsp); they
   are reloaded from the same slots further down.  Presumably this keeps
   them intact across the calls to cosf.  */
150 movups %xmm8, 112(%rsp)
151 movups %xmm9, 96(%rsp)
152 movups %xmm10, 80(%rsp)
153 movups %xmm11, 64(%rsp)
154 movups %xmm12, 48(%rsp)
155 movups %xmm13, 32(%rsp)
156 movups %xmm14, 16(%rsp)
157 movups %xmm15, (%rsp)
/* Unwind info: registers numbered 12-15 (presumably %r12-%r15) are
   recorded as saved at 168, 160, 152 and 144 bytes from %rsp.  The stores
   themselves are not visible in this view.  */
161 cfi_offset_rel_rsp (12, 168)
164 cfi_offset_rel_rsp (13, 160)
167 cfi_offset_rel_rsp (14, 152)
170 cfi_offset_rel_rsp (15, 144)
/* Reload %xmm8-%xmm15 from the slots they were saved to above.  */
188 movups 112(%rsp), %xmm8
189 movups 96(%rsp), %xmm9
190 movups 80(%rsp), %xmm10
191 movups 64(%rsp), %xmm11
192 movups 48(%rsp), %xmm12
193 movups 32(%rsp), %xmm13
194 movups 16(%rsp), %xmm14
195 movups (%rsp), %xmm15
/* Reload the result vector from its stack slot into %xmm0.  */
206 movups 256(%rsp), %xmm0
/* Scalar path, upper float of the pair selected by %r15: the argument is
   read from 196(%rsp) + 8*%r15 (4 bytes past the 192(%rsp) base), scalar
   cosf is called, and the result is stored at 260(%rsp) + 8*%r15.  */
212 movss 196(%rsp,%r15,8), %xmm0
214 call JUMPTARGET(cosf)
216 movss %xmm0, 260(%rsp,%r15,8)
/* Scalar path, lower float of the pair selected by %r15: the argument is
   read from 192(%rsp) + 8*%r15, scalar cosf is called, and the result is
   stored at 256(%rsp) + 8*%r15.  */
221 movss 192(%rsp,%r15,8), %xmm0
223 call JUMPTARGET(cosf)
225 movss %xmm0, 256(%rsp,%r15,8)
227 END (_ZGVbN4v_cosf_sse4)