1 /* Function sinf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_s_trig_data.h"
21 #include "svml_s_wrapper_impl.h"
23 .section .text.evex512, "ax", @progbits
/* _ZGVeN16v_sinf_knl: sine of the 16 single-precision lanes of %zmm0.
   Visible data flow: the input vector is read from %zmm0 and the vector
   result is assembled in %zmm1.  Both are also spilled to the stack
   (input at 1152(%rsp), result at 1216(%rsp)), where individual lanes
   are recomputed through calls to sinf.
   NOTE(review): this listing has gaps (frame setup, labels, branches and
   some register initialisations are not shown), so the control flow that
   links the sections below cannot be confirmed from here.  */
24 ENTRY(_ZGVeN16v_sinf_knl)
26 ALGORITHM DESCRIPTION:
28 1) Range reduction to [-Pi/2; +Pi/2] interval
29 a) Grab sign from source argument and save it.
30 b) Remove sign using AND operation
31 c) Getting octant Y by 1/Pi multiplication
32 d) Add "Right Shifter" value
33 e) Treat obtained value as integer for destination sign setting.
34 Shift first bit of this value to the last (sign) position
35 f) Change destination sign if source sign is negative
37 g) Subtract "Right Shifter" value
38 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
39 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
40 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
41 a) Calculate X^2 = X * X
42 b) Calculate polynomial:
43 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
44 3) Destination sign setting
45 a) Set shifted destination sign using XOR operation:
49 cfi_adjust_cfa_offset (8)  /* unwind info only; the matching push is not among the lines shown */
50 cfi_rel_offset (%rbp, 0)
52 cfi_def_cfa_register (%rbp)
55 movq __svml_s_trig_data@GOTPCREL(%rip), %rax  /* rax = address of __svml_s_trig_data, fetched through the GOT; every __s* operand below is a displacement from it */
57 /* Check for large and special values */
59 vmovups __sAbsMask(%rax), %zmm4  /* mask used by the AND / AND-NOT below to split magnitude from sign */
60 vmovups __sInvPI(%rax), %zmm1  /* multiplier for step c) */
62 /* b) Remove sign using AND operation */
63 vpandd %zmm4, %zmm0, %zmm12  /* zmm12 = x & AbsMask */
64 vmovups __sPI1_FMA(%rax), %zmm2  /* first of the three PI terms subtracted below */
65 vmovups __sA9(%rax), %zmm7  /* highest coefficient; seeds the Horner chain below */
68 f) Change destination sign if source sign is negative
71 vpandnd %zmm0, %zmm4, %zmm11  /* zmm11 = x & ~AbsMask: the bits of x that the mask removed (saved sign) */
74 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
75 X = X - Y*PI1 - Y*PI2 - Y*PI3;
80 c) Getting octant Y by 1/Pi multiplication
81 d) Add "Right Shifter" value
83 vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1  /* zmm1 = zmm12 * zmm1 + RShifter */
84 vcmpps $22, __sRangeReductionVal(%rax), %zmm12, %k1  /* k1 = lanes where zmm12 is NOT <= RangeReductionVal; predicate 22 is NLE_UQ, so NaN lanes are included */
85 vpbroadcastd %edx, %zmm13{%k1}{z}  /* zmm13 = edx in k1 lanes, zero elsewhere; NOTE(review): edx is not set in the lines shown - confirm its value */
87 /* g) Subtract "Right Shifter" value */
88 vsubps __sRShifter(%rax), %zmm1, %zmm5  /* zmm5 = zmm1 - RShifter, i.e. Y */
91 e) Treat obtained value as integer for destination sign setting.
92 Shift first bit of this value to the last (sign) position
94 vpslld $31, %zmm1, %zmm6  /* bit 0 of the still-shifted value moves into the sign position */
95 vptestmd %zmm13, %zmm13, %k0  /* k0 = lanes where zmm13 is nonzero; the consumer of k0 is not among the lines shown */
96 vfnmadd231ps %zmm5, %zmm2, %zmm3  /* zmm3 -= PI1 * Y; NOTE(review): zmm3 is not initialised in the lines shown - presumably a copy of zmm12, confirm */
98 vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3  /* zmm3 -= PI2 * Y */
99 vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5  /* zmm5 = zmm3 - Y * PI3: reduced argument X.  NOTE(review): three PI terms are subtracted here although the description above mentions 4 parts and a PI4 term */
102 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
103 a) Calculate X^2 = X * X
104 b) Calculate polynomial:
105 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
107 vmulps %zmm5, %zmm5, %zmm8  /* zmm8 = X^2 */
108 vpxord %zmm6, %zmm5, %zmm9  /* zmm9 = X with its sign bit flipped where zmm6 has it set */
109 vfmadd213ps __sA7(%rax), %zmm8, %zmm7  /* zmm7 = A9 * X^2 + A7 */
110 vfmadd213ps __sA5(%rax), %zmm8, %zmm7  /* zmm7 = zmm7 * X^2 + A5 */
111 vfmadd213ps __sA3(%rax), %zmm8, %zmm7  /* zmm7 = zmm7 * X^2 + A3 */
112 vmulps %zmm8, %zmm7, %zmm10  /* zmm10 = X^2 * polynomial */
113 vfmadd213ps %zmm9, %zmm9, %zmm10  /* zmm10 = zmm9 * zmm10 + zmm9 */
116 3) Destination sign setting
117 a) Set shifted destination sign using XOR operation:
120 vpxord %zmm11, %zmm10, %zmm1  /* zmm1 = zmm10 ^ saved sign bits of the input */
128 cfi_def_cfa_register (%rsp)
130 cfi_adjust_cfa_offset (-8)
136 vmovups %zmm0, 1152(%rsp)  /* spill the input vector; the scalar calls below read single lanes from this slot */
137 vmovups %zmm1, 1216(%rsp)  /* spill the vector result; the scalar calls below overwrite single lanes in this slot */
141 kmovw %k4, 1048(%rsp)  /* save k4-k7, zmm16-zmm31, rsi, rdi and r12-r15 to the frame; they are reloaded from the same slots further down */
143 kmovw %k5, 1040(%rsp)
144 kmovw %k6, 1032(%rsp)
145 kmovw %k7, 1024(%rsp)
146 vmovups %zmm16, 960(%rsp)
147 vmovups %zmm17, 896(%rsp)
148 vmovups %zmm18, 832(%rsp)
149 vmovups %zmm19, 768(%rsp)
150 vmovups %zmm20, 704(%rsp)
151 vmovups %zmm21, 640(%rsp)
152 vmovups %zmm22, 576(%rsp)
153 vmovups %zmm23, 512(%rsp)
154 vmovups %zmm24, 448(%rsp)
155 vmovups %zmm25, 384(%rsp)
156 vmovups %zmm26, 320(%rsp)
157 vmovups %zmm27, 256(%rsp)
158 vmovups %zmm28, 192(%rsp)
159 vmovups %zmm29, 128(%rsp)
160 vmovups %zmm30, 64(%rsp)
161 vmovups %zmm31, (%rsp)
162 movq %rsi, 1064(%rsp)
163 movq %rdi, 1056(%rsp)
164 movq %r12, 1096(%rsp)
165 cfi_offset_rel_rsp (12, 1096)  /* unwind info: r12 saved at rsp+1096; the same pattern follows for r13-r15 */
167 movq %r13, 1088(%rsp)
168 cfi_offset_rel_rsp (13, 1088)
170 movq %r14, 1080(%rsp)
171 cfi_offset_rel_rsp (14, 1080)
173 movq %r15, 1072(%rsp)
174 cfi_offset_rel_rsp (15, 1072)
192 kmovw 1048(%rsp), %k4  /* restore everything saved above from the same frame offsets */
193 movq 1064(%rsp), %rsi
194 kmovw 1040(%rsp), %k5
195 movq 1056(%rsp), %rdi
196 kmovw 1032(%rsp), %k6
197 movq 1096(%rsp), %r12
199 movq 1088(%rsp), %r13
201 kmovw 1024(%rsp), %k7
202 vmovups 960(%rsp), %zmm16
203 vmovups 896(%rsp), %zmm17
204 vmovups 832(%rsp), %zmm18
205 vmovups 768(%rsp), %zmm19
206 vmovups 704(%rsp), %zmm20
207 vmovups 640(%rsp), %zmm21
208 vmovups 576(%rsp), %zmm22
209 vmovups 512(%rsp), %zmm23
210 vmovups 448(%rsp), %zmm24
211 vmovups 384(%rsp), %zmm25
212 vmovups 320(%rsp), %zmm26
213 vmovups 256(%rsp), %zmm27
214 vmovups 192(%rsp), %zmm28
215 vmovups 128(%rsp), %zmm29
216 vmovups 64(%rsp), %zmm30
217 vmovups (%rsp), %zmm31
218 movq 1080(%rsp), %r14
220 movq 1072(%rsp), %r15
222 vmovups 1216(%rsp), %zmm1  /* reload the result slot, which the scalar stores below may have patched lane by lane */
228 vmovss 1156(%rsp,%r15,8), %xmm0  /* input float at byte offset 8*r15 + 4 of the spilled input (second float of pair r15); r15 is set by a line not shown */
229 call JUMPTARGET(sinf)  /* one-lane sinf: argument and return value both in xmm0 */
230 vmovss %xmm0, 1220(%rsp,%r15,8)  /* store into the same lane of the spilled result (1220 = 1216 + 4) */
235 vmovss 1152(%rsp,%r15,8), %xmm0  /* input float at byte offset 8*r15 of the spilled input (first float of pair r15) */
236 call JUMPTARGET(sinf)
237 vmovss %xmm0, 1216(%rsp,%r15,8)  /* store into the same lane of the spilled result */
239 END(_ZGVeN16v_sinf_knl)
/* _ZGVeN16v_sinf_skx: sine of the 16 single-precision lanes of %zmm0.
   Visible data flow: the input vector is read from %zmm0 and the vector
   result is assembled in %zmm1.  Both are also spilled to the stack
   (input at 1152(%rsp), result at 1216(%rsp)), where individual lanes
   are recomputed through calls to sinf.
   NOTE(review): this listing has gaps (frame setup, labels, branches and
   some instructions are not shown), so the control flow that links the
   sections below cannot be confirmed from here.  */
241 ENTRY (_ZGVeN16v_sinf_skx)
243 ALGORITHM DESCRIPTION:
245 1) Range reduction to [-Pi/2; +Pi/2] interval
246 a) Grab sign from source argument and save it.
247 b) Remove sign using AND operation
248 c) Getting octant Y by 1/Pi multiplication
249 d) Add "Right Shifter" value
250 e) Treat obtained value as integer for destination sign setting.
251 Shift first bit of this value to the last (sign) position
252 f) Change destination sign if source sign is negative
254 g) Subtract "Right Shifter" value
255 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
256 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
257 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
258 a) Calculate X^2 = X * X
259 b) Calculate polynomial:
260 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
261 3) Destination sign setting
262 a) Set shifted destination sign using XOR operation:
267 cfi_adjust_cfa_offset (8)  /* unwind info only; the matching push is not among the lines shown */
268 cfi_rel_offset (%rbp, 0)
270 cfi_def_cfa_register (%rbp)
273 movq __svml_s_trig_data@GOTPCREL(%rip), %rax  /* rax = address of __svml_s_trig_data, fetched through the GOT; every __s* operand below is a displacement from it */
275 /* Check for large and special values */
276 vpternlogd $0xff, %zmm14, %zmm14, %zmm14  /* truth table 0xff yields 1 for every input bit: zmm14 = all ones */
277 vmovups __sAbsMask(%rax), %zmm5  /* mask used by the AND / AND-NOT below to split magnitude from sign */
278 vmovups __sInvPI(%rax), %zmm1  /* multiplier for step c) */
279 vmovups __sRShifter(%rax), %zmm2  /* kept in a register: added in step d) and subtracted again in step g) */
280 vmovups __sPI1_FMA(%rax), %zmm3  /* first of the three PI terms subtracted below */
281 vmovups __sA9(%rax), %zmm8  /* highest coefficient; seeds the Horner chain below */
283 /* b) Remove sign using AND operation */
284 vandps %zmm5, %zmm0, %zmm13  /* zmm13 = x & AbsMask */
287 f) Change destination sign if source sign is negative
290 vandnps %zmm0, %zmm5, %zmm12  /* zmm12 = x & ~AbsMask: the bits of x that the mask removed (saved sign) */
293 c) Getting octant Y by 1/Pi multiplication
294 d) Add "Right Shifter" value
296 vfmadd213ps %zmm2, %zmm13, %zmm1  /* zmm1 = zmm13 * zmm1 + RShifter */
297 vcmpps $18, __sRangeReductionVal(%rax), %zmm13, %k1  /* k1 = lanes where zmm13 <= RangeReductionVal; predicate 18 is LE_OQ, so NaN lanes are excluded */
300 e) Treat obtained value as integer for destination sign setting.
301 Shift first bit of this value to the last (sign) position
303 vpslld $31, %zmm1, %zmm7  /* bit 0 of the still-shifted value moves into the sign position */
305 /* g) Subtract "Right Shifter" value */
306 vsubps %zmm2, %zmm1, %zmm6  /* zmm6 = zmm1 - RShifter, i.e. Y */
309 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
310 X = X - Y*PI1 - Y*PI2 - Y*PI3;
312 vmovaps %zmm13, %zmm4  /* zmm4 starts as the sign-stripped input and accumulates the subtractions */
313 vfnmadd231ps %zmm6, %zmm3, %zmm4  /* zmm4 -= PI1 * Y */
314 vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4  /* zmm4 -= PI2 * Y */
315 vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6  /* zmm6 = zmm4 - Y * PI3: reduced argument X.  NOTE(review): three PI terms are subtracted here although the description above mentions 4 parts and a PI4 term */
318 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
319 a) Calculate X^2 = X * X
320 b) Calculate polynomial:
321 R = X + X * X^2 * (A3 + x^2 * (A5 + ......
323 vmulps %zmm6, %zmm6, %zmm9  /* zmm9 = X^2 */
324 vxorps %zmm7, %zmm6, %zmm10  /* zmm10 = X with its sign bit flipped where zmm7 has it set */
325 vfmadd213ps __sA7(%rax), %zmm9, %zmm8  /* zmm8 = A9 * X^2 + A7 */
326 vfmadd213ps __sA5(%rax), %zmm9, %zmm8  /* zmm8 = zmm8 * X^2 + A5 */
327 vfmadd213ps __sA3(%rax), %zmm9, %zmm8  /* zmm8 = zmm8 * X^2 + A3 */
328 vmulps %zmm9, %zmm8, %zmm11  /* zmm11 = X^2 * polynomial */
329 vfmadd213ps %zmm10, %zmm10, %zmm11  /* zmm11 = zmm10 * zmm11 + zmm10 */
332 3) Destination sign setting
333 a) Set shifted destination sign using XOR operation:
336 vxorps %zmm12, %zmm11, %zmm1  /* zmm1 = zmm11 ^ saved sign bits of the input */
337 vpandnd %zmm13, %zmm13, %zmm14{%k1}  /* ~a & a is zero: clears zmm14 in k1 lanes; the other lanes keep their all-ones value (merge masking) */
338 vptestmd %zmm14, %zmm14, %k0  /* k0 = lanes still nonzero, i.e. lanes where the k1 compare was false; the consumer of k0 is not among the lines shown */
347 cfi_def_cfa_register (%rsp)
349 cfi_adjust_cfa_offset (-8)
355 vmovups %zmm0, 1152(%rsp)  /* spill the input vector; the scalar calls below read single lanes from this slot */
356 vmovups %zmm1, 1216(%rsp)  /* spill the vector result; the scalar calls below overwrite single lanes in this slot */
361 kmovw %k4, 1048(%rsp)  /* save k4-k7, zmm16-zmm31, rsi, rdi and r12-r15 to the frame; they are reloaded from the same slots further down */
362 kmovw %k5, 1040(%rsp)
363 kmovw %k6, 1032(%rsp)
364 kmovw %k7, 1024(%rsp)
365 vmovups %zmm16, 960(%rsp)
366 vmovups %zmm17, 896(%rsp)
367 vmovups %zmm18, 832(%rsp)
368 vmovups %zmm19, 768(%rsp)
369 vmovups %zmm20, 704(%rsp)
370 vmovups %zmm21, 640(%rsp)
371 vmovups %zmm22, 576(%rsp)
372 vmovups %zmm23, 512(%rsp)
373 vmovups %zmm24, 448(%rsp)
374 vmovups %zmm25, 384(%rsp)
375 vmovups %zmm26, 320(%rsp)
376 vmovups %zmm27, 256(%rsp)
377 vmovups %zmm28, 192(%rsp)
378 vmovups %zmm29, 128(%rsp)
379 vmovups %zmm30, 64(%rsp)
380 vmovups %zmm31, (%rsp)
381 movq %rsi, 1064(%rsp)
382 movq %rdi, 1056(%rsp)
383 movq %r12, 1096(%rsp)
384 cfi_offset_rel_rsp (12, 1096)  /* unwind info: r12 saved at rsp+1096; the same pattern follows for r13-r15 */
386 movq %r13, 1088(%rsp)
387 cfi_offset_rel_rsp (13, 1088)
389 movq %r14, 1080(%rsp)
390 cfi_offset_rel_rsp (14, 1080)
392 movq %r15, 1072(%rsp)
393 cfi_offset_rel_rsp (15, 1072)
411 kmovw 1048(%rsp), %k4  /* restore everything saved above from the same frame offsets */
412 kmovw 1040(%rsp), %k5
413 kmovw 1032(%rsp), %k6
414 kmovw 1024(%rsp), %k7
415 vmovups 960(%rsp), %zmm16
416 vmovups 896(%rsp), %zmm17
417 vmovups 832(%rsp), %zmm18
418 vmovups 768(%rsp), %zmm19
419 vmovups 704(%rsp), %zmm20
420 vmovups 640(%rsp), %zmm21
421 vmovups 576(%rsp), %zmm22
422 vmovups 512(%rsp), %zmm23
423 vmovups 448(%rsp), %zmm24
424 vmovups 384(%rsp), %zmm25
425 vmovups 320(%rsp), %zmm26
426 vmovups 256(%rsp), %zmm27
427 vmovups 192(%rsp), %zmm28
428 vmovups 128(%rsp), %zmm29
429 vmovups 64(%rsp), %zmm30
430 vmovups (%rsp), %zmm31
431 vmovups 1216(%rsp), %zmm1  /* reload the result slot, which the scalar stores below may have patched lane by lane */
432 movq 1064(%rsp), %rsi
433 movq 1056(%rsp), %rdi
434 movq 1096(%rsp), %r12
436 movq 1088(%rsp), %r13
438 movq 1080(%rsp), %r14
440 movq 1072(%rsp), %r15
447 vmovss 1156(%rsp,%r15,8), %xmm0  /* input float at byte offset 8*r15 + 4 of the spilled input (second float of pair r15); r15 is set by a line not shown */
449 vmovss 1156(%rsp,%r15,8), %xmm0  /* NOTE(review): identical to the load just above; whatever sits between the two is not shown here, so one of them looks redundant - confirm before removing */
451 call JUMPTARGET(sinf)  /* one-lane sinf: argument and return value both in xmm0 */
453 vmovss %xmm0, 1220(%rsp,%r15,8)  /* store into the same lane of the spilled result (1220 = 1216 + 4) */
458 vmovss 1152(%rsp,%r15,8), %xmm0  /* input float at byte offset 8*r15 of the spilled input (first float of pair r15) */
460 vmovss 1152(%rsp,%r15,8), %xmm0  /* NOTE(review): identical to the load just above - same remark as for the odd-lane path */
462 call JUMPTARGET(sinf)
464 vmovss %xmm0, 1216(%rsp,%r15,8)  /* store into the same lane of the spilled result */
466 END (_ZGVeN16v_sinf_skx)