1 /* Function cosf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
21 #include "svml_s_wrapper_impl.h"
/* _ZGVeN16v_cosf_knl: KNL flavour of the AVX-512 vectorized cosf named in
   the file header.  The visible code reads the argument vector from
   %zmm0 and builds the result vector in %zmm1.
   NOTE(review): the embedded original line numbers in this view skip
   values, so some statements (labels, branches, short instructions) are
   presumably not shown here; the comments below describe only the lines
   that are visible.  */
24 ENTRY (_ZGVeN16v_cosf_knl)
/* When HAVE_AVX512_ASM_SUPPORT is not defined the body is the
   WRAPPER_IMPL_AVX512 macro applied to _ZGVdN8v_cosf (macro presumably
   provided by svml_s_wrapper_impl.h -- confirm).  */
25 #ifndef HAVE_AVX512_ASM_SUPPORT
26 WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
29 ALGORITHM DESCRIPTION:
31 1) Range reduction to [-Pi/2; +Pi/2] interval
32 a) We remove sign using AND operation
33 b) Add Pi/2 value to argument X for Cos to Sin transformation
34 c) Getting octant Y by 1/Pi multiplication
35 d) Add "Right Shifter" value
36 e) Treat obtained value as integer for destination sign setting.
37 Shift first bit of this value to the last (sign) position
38 f) Subtract "Right Shifter" value
39 g) Subtract 0.5 from result for octant correction
40 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
41 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
42 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
43 a) Calculate X^2 = X * X
44 b) Calculate polynomial:
45 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
46 3) Destination sign setting
47 a) Set shifted destination sign using XOR operation:
/* Unwind annotations: the CFA offset grows by 8, %rbp is recorded at
   offset 0, and %rbp then becomes the CFA register.  The instructions
   these describe are not visible in this view.  */
51 cfi_adjust_cfa_offset (8)
52 cfi_rel_offset (%rbp, 0)
54 cfi_def_cfa_register (%rbp)
/* %rdx = address of __svml_s_trig_data, fetched through the GOT.  Every
   __s* constant below is read at an offset from %rdx.  */
57 movq __svml_s_trig_data@GOTPCREL(%rip), %rdx
60 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
61 X = X - Y*PI1 - Y*PI2 - Y*PI3
66 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
/* zmm2 = x + __sHalfPI;  zmm3 = __sRShifter.  */
67 vaddps __sHalfPI(%rdx), %zmm0, %zmm2
68 vmovups __sRShifter(%rdx), %zmm3
71 1) Range reduction to [-Pi/2; +Pi/2] interval
72 c) Getting octant Y by 1/Pi multiplication
73 d) Add "Right Shifter" (0x4B000000) value
/* zmm2 = zmm2 * __sInvPI + zmm3 (one fused multiply-add);
   zmm5 = __sPI1_FMA.  */
75 vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
76 vmovups __sPI1_FMA(%rdx), %zmm5
78 /* f) Subtract "Right Shifter" (0x4B000000) value */
/* zmm4 = zmm2 - zmm3;  zmm9 = __sA9_FMA (seed of the polynomial).  */
79 vsubps %zmm3, %zmm2, %zmm4
80 vmovups __sA9_FMA(%rdx), %zmm9
82 /* Check for large and special arguments */
/* zmm1 = x AND __sAbsMask; this value is what gets compared against
   __sRangeReductionVal below.  */
83 vpandd __sAbsMask(%rdx), %zmm0, %zmm1
86 e) Treat obtained value as integer for destination sign setting.
87 Shift first bit of this value to the last (sign) position (S << 31)
/* zmm8 = each 32-bit lane of zmm2 shifted left by 31 (sign word S).
   k1 = lanes where zmm1 is NOT <= __sRangeReductionVal (predicate 22,
   NLE_UQ, so unordered/NaN lanes are selected too).
   zmm12 = %eax broadcast into the k1 lanes, zero elsewhere; %eax is
   set outside the lines visible here.  */
89 vpslld $31, %zmm2, %zmm8
90 vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
91 vpbroadcastd %eax, %zmm12{%k1}{z}
93 /* g) Subtract 0.5 from result for octant correction */
/* zmm7 = zmm4 - __sOneHalf (the Y of step h).
   k0 = lanes where zmm12 is non-zero.
   zmm6 = zmm6 - zmm5 * zmm7; the initial value of zmm6 is written
   outside the lines visible here (presumably a copy of x -- confirm).  */
94 vsubps __sOneHalf(%rdx), %zmm4, %zmm7
95 vptestmd %zmm12, %zmm12, %k0
96 vfnmadd231ps %zmm7, %zmm5, %zmm6
/* zmm6 = zmm6 - zmm7 * __sPI2_FMA, then
   zmm7 = zmm6 - zmm7 * __sPI3_FMA: the reduced argument.  */
98 vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
99 vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
101 /* a) Calculate X^2 = X * X */
102 vmulps %zmm7, %zmm7, %zmm10
105 3) Destination sign setting
106 a) Set shifted destination sign using XOR operation:
/* zmm11 = zmm7 XOR zmm8 (reduced argument with the sign word applied).  */
109 vpxord %zmm8, %zmm7, %zmm11
112 b) Calculate polynomial:
113 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
/* Horner steps in zmm9 with x^2 = zmm10:
   zmm9 = zmm10 * zmm9 + A7, then + A5, then + A3.
   zmm1 = zmm9 * zmm10, and finally zmm1 = zmm11 * zmm1 + zmm11.  */
115 vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
116 vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
117 vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
118 vmulps %zmm10, %zmm9, %zmm1
119 vfmadd213ps %zmm11, %zmm11, %zmm1
/* Unwind annotations: %rsp becomes the CFA register again and the CFA
   offset shrinks by 8.  */
127 cfi_def_cfa_register (%rsp)
129 cfi_adjust_cfa_offset (-8)
/* Spill the input vector to 1152(%rsp) and the vector result to
   1216(%rsp); the scalar cosf calls further down read and patch
   individual elements at these two locations.  */
135 vmovups %zmm0, 1152(%rsp)
136 vmovups %zmm1, 1216(%rsp)
/* Save k4-k7, zmm16-zmm31, %rsi, %rdi and %r12-%r15 in the stack frame
   (presumably because the cosf calls below may clobber them -- confirm).
   The cfi_offset_rel_rsp lines record where r12-r15 were stored.  */
140 kmovw %k4, 1048(%rsp)
142 kmovw %k5, 1040(%rsp)
143 kmovw %k6, 1032(%rsp)
144 kmovw %k7, 1024(%rsp)
145 vmovups %zmm16, 960(%rsp)
146 vmovups %zmm17, 896(%rsp)
147 vmovups %zmm18, 832(%rsp)
148 vmovups %zmm19, 768(%rsp)
149 vmovups %zmm20, 704(%rsp)
150 vmovups %zmm21, 640(%rsp)
151 vmovups %zmm22, 576(%rsp)
152 vmovups %zmm23, 512(%rsp)
153 vmovups %zmm24, 448(%rsp)
154 vmovups %zmm25, 384(%rsp)
155 vmovups %zmm26, 320(%rsp)
156 vmovups %zmm27, 256(%rsp)
157 vmovups %zmm28, 192(%rsp)
158 vmovups %zmm29, 128(%rsp)
159 vmovups %zmm30, 64(%rsp)
160 vmovups %zmm31, (%rsp)
161 movq %rsi, 1064(%rsp)
162 movq %rdi, 1056(%rsp)
163 movq %r12, 1096(%rsp)
164 cfi_offset_rel_rsp (12, 1096)
166 movq %r13, 1088(%rsp)
167 cfi_offset_rel_rsp (13, 1088)
169 movq %r14, 1080(%rsp)
170 cfi_offset_rel_rsp (14, 1080)
172 movq %r15, 1072(%rsp)
173 cfi_offset_rel_rsp (15, 1072)
/* Restore everything saved above from the same slots, then reload the
   (possibly patched) result vector from 1216(%rsp) into zmm1.  */
191 kmovw 1048(%rsp), %k4
192 movq 1064(%rsp), %rsi
193 kmovw 1040(%rsp), %k5
194 movq 1056(%rsp), %rdi
195 kmovw 1032(%rsp), %k6
196 movq 1096(%rsp), %r12
198 movq 1088(%rsp), %r13
200 kmovw 1024(%rsp), %k7
201 vmovups 960(%rsp), %zmm16
202 vmovups 896(%rsp), %zmm17
203 vmovups 832(%rsp), %zmm18
204 vmovups 768(%rsp), %zmm19
205 vmovups 704(%rsp), %zmm20
206 vmovups 640(%rsp), %zmm21
207 vmovups 576(%rsp), %zmm22
208 vmovups 512(%rsp), %zmm23
209 vmovups 448(%rsp), %zmm24
210 vmovups 384(%rsp), %zmm25
211 vmovups 320(%rsp), %zmm26
212 vmovups 256(%rsp), %zmm27
213 vmovups 192(%rsp), %zmm28
214 vmovups 128(%rsp), %zmm29
215 vmovups 64(%rsp), %zmm30
216 vmovups (%rsp), %zmm31
217 movq 1080(%rsp), %r14
219 movq 1072(%rsp), %r15
221 vmovups 1216(%rsp), %zmm1
/* Scalar path, second float of the pair selected by %r15: load the
   element at byte offset 4 + 8*%r15 of the spilled input (1152 + 4),
   call cosf, and store the result at the same offset of the spilled
   result (1216 + 4).  How %r15 is set and how control reaches this
   point is not visible in this view.  */
227 vmovss 1156(%rsp,%r15,8), %xmm0
228 call JUMPTARGET(cosf)
229 vmovss %xmm0, 1220(%rsp,%r15,8)
/* Scalar path, first float of the pair selected by %r15: same as above
   at byte offset 8*%r15 (input at 1152, result at 1216).  */
234 vmovss 1152(%rsp,%r15,8), %xmm0
235 call JUMPTARGET(cosf)
236 vmovss %xmm0, 1216(%rsp,%r15,8)
239 END (_ZGVeN16v_cosf_knl)
/* _ZGVeN16v_cosf_skx: SKX flavour of the AVX-512 vectorized cosf named in
   the file header.  The visible code reads the argument vector from
   %zmm0 and builds the result vector in %zmm1.
   NOTE(review): the embedded original line numbers in this view skip
   values, so some statements (labels, branches, short instructions) are
   presumably not shown here; the comments below describe only the lines
   that are visible.  */
241 ENTRY (_ZGVeN16v_cosf_skx)
/* When HAVE_AVX512_ASM_SUPPORT is not defined the body is the
   WRAPPER_IMPL_AVX512 macro applied to _ZGVdN8v_cosf (macro presumably
   provided by svml_s_wrapper_impl.h -- confirm).  */
242 #ifndef HAVE_AVX512_ASM_SUPPORT
243 WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
246 ALGORITHM DESCRIPTION:
248 1) Range reduction to [-Pi/2; +Pi/2] interval
249 a) We remove sign using AND operation
250 b) Add Pi/2 value to argument X for Cos to Sin transformation
251 c) Getting octant Y by 1/Pi multiplication
252 d) Add "Right Shifter" value
253 e) Treat obtained value as integer for destination sign setting.
254 Shift first bit of this value to the last (sign) position
255 f) Subtract "Right Shifter" value
256 g) Subtract 0.5 from result for octant correction
257 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
258 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
259 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
260 a) Calculate X^2 = X * X
261 b) Calculate polynomial:
262 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
263 3) Destination sign setting
264 a) Set shifted destination sign using XOR operation:
/* Unwind annotations: the CFA offset grows by 8, %rbp is recorded at
   offset 0, and %rbp then becomes the CFA register.  The instructions
   these describe are not visible in this view.  */
268 cfi_adjust_cfa_offset (8)
269 cfi_rel_offset (%rbp, 0)
271 cfi_def_cfa_register (%rbp)
/* %rax = address of __svml_s_trig_data, fetched through the GOT.  Every
   __s* constant below is read at an offset from %rax.  */
274 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
277 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
278 X = X - Y*PI1 - Y*PI2 - Y*PI3
/* zmm12 = the all-ones constant .L_2il0floatpacket.13 (initial
   "special lane" mask, cleared per lane further down);
   zmm3 = __sRShifter, zmm5 = __sPI1_FMA, zmm9 = __sA9_FMA.  */
281 vmovups .L_2il0floatpacket.13(%rip), %zmm12
282 vmovups __sRShifter(%rax), %zmm3
283 vmovups __sPI1_FMA(%rax), %zmm5
284 vmovups __sA9_FMA(%rax), %zmm9
286 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
/* zmm2 = x + __sHalfPI.  */
287 vaddps __sHalfPI(%rax), %zmm0, %zmm2
289 /* Check for large and special arguments */
/* zmm1 = x AND __sAbsMask; this value is what gets compared against
   __sRangeReductionVal below.  */
290 vandps __sAbsMask(%rax), %zmm0, %zmm1
293 1) Range reduction to [-Pi/2; +Pi/2] interval
294 c) Getting octant Y by 1/Pi multiplication
295 d) Add "Right Shifter" (0x4B000000) value
/* zmm2 = zmm2 * __sInvPI + zmm3 (one fused multiply-add).
   k1 = lanes where zmm1 <= __sRangeReductionVal (predicate 18, LE_OQ,
   so unordered/NaN lanes are NOT selected).  */
297 vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
298 vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
301 e) Treat obtained value as integer for destination sign setting.
302 Shift first bit of this value to the last (sign) position (S << 31)
/* zmm8 = each 32-bit lane of zmm2 shifted left by 31 (sign word S).  */
304 vpslld $31, %zmm2, %zmm8
306 /* f) Subtract "Right Shifter" (0x4B000000) value */
/* zmm4 = zmm2 - zmm3.  */
307 vsubps %zmm3, %zmm2, %zmm4
309 /* g) Subtract 0.5 from result for octant correction */
/* zmm7 = zmm4 - __sOneHalf (the Y of step h).
   zmm6 = zmm6 - zmm5 * zmm7; the initial value of zmm6 is written
   outside the lines visible here (presumably a copy of x -- confirm).
   zmm6 = zmm6 - zmm7 * __sPI2_FMA, then
   zmm7 = zmm6 - zmm7 * __sPI3_FMA: the reduced argument.  */
310 vsubps __sOneHalf(%rax), %zmm4, %zmm7
311 vfnmadd231ps %zmm7, %zmm5, %zmm6
312 vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
313 vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
315 /* a) Calculate X^2 = X * X */
316 vmulps %zmm7, %zmm7, %zmm10
319 3) Destination sign setting
320 a) Set shifted destination sign using XOR operation:
/* zmm11 = zmm7 XOR zmm8 (reduced argument with the sign word applied).  */
323 vxorps %zmm8, %zmm7, %zmm11
326 b) Calculate polynomial:
327 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
/* Horner steps in zmm9 with x^2 = zmm10:
   zmm9 = zmm10 * zmm9 + A7, then + A5, then + A3.  */
329 vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
330 vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
331 vfmadd213ps __sA3(%rax), %zmm10, %zmm9
/* (NOT zmm1) AND zmm1 is zero, so this clears zmm12 in the k1
   (in-range) lanes and leaves the all-ones value in the others.
   zmm1 is then free to be reused: zmm1 = zmm9 * zmm10.
   k0 = lanes where zmm12 is still non-zero, i.e. lanes not in k1.
   Finally zmm1 = zmm11 * zmm1 + zmm11.  */
332 vpandnd %zmm1, %zmm1, %zmm12{%k1}
333 vmulps %zmm10, %zmm9, %zmm1
334 vptestmd %zmm12, %zmm12, %k0
335 vfmadd213ps %zmm11, %zmm11, %zmm1
/* Unwind annotations: %rsp becomes the CFA register again and the CFA
   offset shrinks by 8.  */
343 cfi_def_cfa_register (%rsp)
345 cfi_adjust_cfa_offset (-8)
/* Spill the input vector to 1152(%rsp) and the vector result to
   1216(%rsp); the scalar cosf calls further down read and patch
   individual elements at these two locations.  */
351 vmovups %zmm0, 1152(%rsp)
352 vmovups %zmm1, 1216(%rsp)
/* Save k4-k7, zmm16-zmm31, %rsi, %rdi and %r12-%r15 in the stack frame
   (presumably because the cosf calls below may clobber them -- confirm).
   The cfi_offset_rel_rsp lines record where r12-r15 were stored.  */
357 kmovw %k4, 1048(%rsp)
358 kmovw %k5, 1040(%rsp)
359 kmovw %k6, 1032(%rsp)
360 kmovw %k7, 1024(%rsp)
361 vmovups %zmm16, 960(%rsp)
362 vmovups %zmm17, 896(%rsp)
363 vmovups %zmm18, 832(%rsp)
364 vmovups %zmm19, 768(%rsp)
365 vmovups %zmm20, 704(%rsp)
366 vmovups %zmm21, 640(%rsp)
367 vmovups %zmm22, 576(%rsp)
368 vmovups %zmm23, 512(%rsp)
369 vmovups %zmm24, 448(%rsp)
370 vmovups %zmm25, 384(%rsp)
371 vmovups %zmm26, 320(%rsp)
372 vmovups %zmm27, 256(%rsp)
373 vmovups %zmm28, 192(%rsp)
374 vmovups %zmm29, 128(%rsp)
375 vmovups %zmm30, 64(%rsp)
376 vmovups %zmm31, (%rsp)
377 movq %rsi, 1064(%rsp)
378 movq %rdi, 1056(%rsp)
379 movq %r12, 1096(%rsp)
380 cfi_offset_rel_rsp (12, 1096)
382 movq %r13, 1088(%rsp)
383 cfi_offset_rel_rsp (13, 1088)
385 movq %r14, 1080(%rsp)
386 cfi_offset_rel_rsp (14, 1080)
388 movq %r15, 1072(%rsp)
389 cfi_offset_rel_rsp (15, 1072)
/* Restore everything saved above from the same slots and reload the
   (possibly patched) result vector from 1216(%rsp) into zmm1.  */
404 kmovw 1048(%rsp), %k4
405 kmovw 1040(%rsp), %k5
406 kmovw 1032(%rsp), %k6
407 kmovw 1024(%rsp), %k7
408 vmovups 960(%rsp), %zmm16
409 vmovups 896(%rsp), %zmm17
410 vmovups 832(%rsp), %zmm18
411 vmovups 768(%rsp), %zmm19
412 vmovups 704(%rsp), %zmm20
413 vmovups 640(%rsp), %zmm21
414 vmovups 576(%rsp), %zmm22
415 vmovups 512(%rsp), %zmm23
416 vmovups 448(%rsp), %zmm24
417 vmovups 384(%rsp), %zmm25
418 vmovups 320(%rsp), %zmm26
419 vmovups 256(%rsp), %zmm27
420 vmovups 192(%rsp), %zmm28
421 vmovups 128(%rsp), %zmm29
422 vmovups 64(%rsp), %zmm30
423 vmovups (%rsp), %zmm31
424 vmovups 1216(%rsp), %zmm1
425 movq 1064(%rsp), %rsi
426 movq 1056(%rsp), %rdi
427 movq 1096(%rsp), %r12
429 movq 1088(%rsp), %r13
431 movq 1080(%rsp), %r14
433 movq 1072(%rsp), %r15
/* Scalar path, second float of the pair selected by %r15: load the
   element at byte offset 4 + 8*%r15 of the spilled input (1152 + 4),
   call cosf, and store the result at the same offset of the spilled
   result (1216 + 4).  How %r15 is set and how control reaches this
   point is not visible in this view.
   NOTE(review): the load appears twice; the embedded numbering skips a
   line between the two copies, so an instruction that is not shown
   presumably sits between them -- confirm against the full file before
   treating the repeat as redundant.  */
440 vmovss 1156(%rsp,%r15,8), %xmm0
442 vmovss 1156(%rsp,%r15,8), %xmm0
443 call JUMPTARGET(cosf)
444 vmovss %xmm0, 1220(%rsp,%r15,8)
/* Scalar path, first float of the pair selected by %r15: same as above
   at byte offset 8*%r15 (input at 1152, result at 1216).  The repeated
   load carries the same caveat as the pair above.  */
448 vmovss 1152(%rsp,%r15,8), %xmm0
450 vmovss 1152(%rsp,%r15,8), %xmm0
451 call JUMPTARGET(cosf)
452 vmovss %xmm0, 1216(%rsp,%r15,8)
455 END (_ZGVeN16v_cosf_skx)
/* Read-only constant used by _ZGVeN16v_cosf_skx: sixteen 32-bit words of
   0xffffffff (64 bytes, one full zmm register of all-ones bits).  The SKX
   code loads it into zmm12 with vmovups and then clears individual lanes
   under a mask.  */
457 .section .rodata, "a"
458 .L_2il0floatpacket.13:
459 .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
/* Mark the local symbol as a data object.  */
460 .type .L_2il0floatpacket.13,@object