/* Function log1p vectorized with AVX-512.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *    1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
 *    Get short reciprocal approximation Rcp ~ 1/xh
 *    R = (Rcp*xh - 1.0) + Rcp*xl
 *    log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
 *       log(Rcp) is tabulated
 *
 *
 */
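
/*
 * For illustration only: a scalar C model of the algorithm above.  This
 * is a sketch, not the code that is built; the names are hypothetical,
 * frexp/ldexp/round stand in for the getmant/getexp/rndscale
 * instructions, and -log(rcp) stands in for the 16-entry Log_tbl lookup
 * (the vector code folds an exponent correction for rcp < 0.75 into that
 * table).  c2..c9 approximate the series coefficients (-1)^(i+1)/i found
 * in the data at the end of this file.  Special inputs (x <= -1, NaN,
 * Inf) are handled by the scalar fallback below, not by this model.
 *
 *   #include <math.h>
 *
 *   static double poly (double r)       // r + c2*r^2 + ... + c9*r^9
 *   {
 *     static const double c[] = { -0.5, 1.0 / 3, -0.25, 0.2,
 *                                 -1.0 / 6, 1.0 / 7, -0.125, 1.0 / 9 };
 *     double p = c[7];                  // Horner, highest degree first
 *     for (int i = 6; i >= 0; i--)
 *       p = p * r + c[i];
 *     return r + r * r * p;
 *   }
 *
 *   static double log1p_model (double x)
 *   {
 *     double s = 1.0 + x;                          // rounded high part
 *     double xl = fmin (1.0, x) - (s - fmax (1.0, x));  // 2Sum low part
 *     int k;
 *     double m = 2.0 * frexp (s, &k);              // mantissa in [1, 2)
 *     k -= 1;                                      // s ~= 2^k * m
 *     xl = ldexp (xl, -k);                         // Xl * 2^(-k)
 *     double rcp = ldexp (round (ldexp (1.0 / m, 5)), -5);  // short 1/m
 *     double r = (rcp * m - 1.0) + rcp * xl;       // reduced argument
 *     return k * 0x1.62e42fefa39efp-1 - log (rcp) + poly (r);
 *   }
 */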

/* Offsets for data table __svml_dlog1p_data_internal_avx512
 */
#define Log_tbl                         0
#define One                             128
#define SgnMask                         192
#define C075                            256
#define poly_coeff9                     320
#define poly_coeff8                     384
#define poly_coeff7                     448
#define poly_coeff6                     512
#define poly_coeff5                     576
#define poly_coeff4                     640
#define poly_coeff3                     704
#define poly_coeff2                     768
#define L2                              832
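
/* Each offset is the byte distance of one 64-byte-aligned field from the
   start of the table: 16 quadwords of Log_tbl (128 bytes), then one
   broadcast constant per field, matching the struct documented before
   the data at the end of this file.  */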

#include <sysdep.h>

        .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_log1p_skx)
        pushq   %rbp
        cfi_def_cfa_offset(16)
        movq    %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq    $-64, %rsp
        subq    $192, %rsp
        vmovups One+__svml_dlog1p_data_internal_avx512(%rip), %zmm7
        vmovups SgnMask+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
        vmovaps %zmm0, %zmm9
        vaddpd  {rn-sae}, %zmm9, %zmm7, %zmm11
        vandpd  %zmm14, %zmm9, %zmm8

        /* compute 1+x as high, low parts */
        vmaxpd  {sae}, %zmm9, %zmm7, %zmm10
        vminpd  {sae}, %zmm9, %zmm7, %zmm12

        /* GetMant(x), normalized to [1, 2) for x>=0, NaN for x<0 */
        vgetmantpd $8, {sae}, %zmm11, %zmm6
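        /* imm 8 = 0b1000 selects interval [1, 2) (bits 1:0 = 0) with
           sign control 2 (bits 3:2): qNaN for negative inputs, which
           later feeds the special-case mask.  */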

        /* GetExp(x) */
        vgetexppd {sae}, %zmm11, %zmm5
        vsubpd  {rn-sae}, %zmm10, %zmm11, %zmm13

        /* DblRcp ~ 1/Mantissa */
        vrcp14pd %zmm6, %zmm15

        /* Start polynomial evaluation */
        vmovups poly_coeff9+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
        vmovups poly_coeff7+__svml_dlog1p_data_internal_avx512(%rip), %zmm11

        /* Xl */
        vsubpd  {rn-sae}, %zmm13, %zmm12, %zmm2
        vxorpd  %zmm14, %zmm5, %zmm3

        /* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
        vrndscalepd $88, {sae}, %zmm15, %zmm4
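        /* imm 88 = 0x58: bits 1:0 = 0 select round-to-nearest-even, bit 3
           suppresses the precision exception, and the high nibble keeps 5
           fraction bits of the value, i.e. 4 mantissa bits for DblRcp in
           (0.5, 1].  */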
        vmovups poly_coeff5+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
        vmovups poly_coeff6+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
        vmovups poly_coeff3+__svml_dlog1p_data_internal_avx512(%rip), %zmm13

        /* Xl*2^(-Expon) */
        vscalefpd {rn-sae}, %zmm3, %zmm2, %zmm1

        /* Reduced argument: R = DblRcp*(Mantissa+Xl) - 1 */
        vfmsub213pd {rn-sae}, %zmm7, %zmm4, %zmm6
        vmovups __svml_dlog1p_data_internal_avx512(%rip), %zmm3

        /*
         * Table lookup
         * Prepare exponent correction: DblRcp<0.75?
         */
        vmovups C075+__svml_dlog1p_data_internal_avx512(%rip), %zmm2

        /* Prepare table index */
        vpsrlq  $48, %zmm4, %zmm0
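        /* After the shift, bits 3:0 of each lane hold the top 4 mantissa
           bits of DblRcp; vpermt2pd below uses just those bits to pick
           one of the 16 Log_tbl entries split across %zmm3 and the block
           at Log_tbl+64.  */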
        vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm6
        vmovups poly_coeff8+__svml_dlog1p_data_internal_avx512(%rip), %zmm1
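        /* Compare predicates: imm 17 is _CMP_LT_OQ (DblRcp < 0.75 -> %k1);
           imm 4 is _CMP_NEQ_UQ, a self-compare of R that is true only for
           NaN lanes (e.g. x < -1, NaN or Inf inputs) -> %k0.  */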
        vcmppd  $17, {sae}, %zmm2, %zmm4, %k1
        vcmppd  $4, {sae}, %zmm6, %zmm6, %k0
        vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
        vmovups poly_coeff4+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
        vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
        vmovups L2+__svml_dlog1p_data_internal_avx512(%rip), %zmm4
        vpermt2pd Log_tbl+64+__svml_dlog1p_data_internal_avx512(%rip), %zmm0, %zmm3

        /* add 1 to Expon if DblRcp<0.75 */
        vaddpd  {rn-sae}, %zmm7, %zmm5, %zmm5{%k1}

        /* R^2 */
        vmulpd  {rn-sae}, %zmm6, %zmm6, %zmm0
        vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm10
        vmovups poly_coeff2+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
        vmulpd  {rn-sae}, %zmm0, %zmm0, %zmm15
        vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12
        vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1
        kmovw   %k0, %edx
        vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm10

        /* polynomial */
        vfmadd213pd {rn-sae}, %zmm10, %zmm15, %zmm1
        vfmadd213pd {rn-sae}, %zmm6, %zmm0, %zmm1
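
        /* Reconstruction: result = Expon*log(2) + (Tbl + poly(R)); OR-ing
           in the sign of x preserves -0 for x == -0 (for x in (-1, 0) the
           result is negative anyway).  */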
        vaddpd  {rn-sae}, %zmm1, %zmm3, %zmm6
        vfmadd213pd {rn-sae}, %zmm6, %zmm4, %zmm5
        vorpd   %zmm8, %zmm5, %zmm0
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm9

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movq    %rbp, %rsp
        popq    %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)

        /* Branch to process
         * special inputs
         */

L(SPECIAL_VALUES_BRANCH):
        vmovups %zmm9, 64(%rsp)
        vmovups %zmm0, 128(%rsp)
        # LOE rbx r12 r13 r14 r15 edx zmm0

        xorl    %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx

        vzeroupper
        movq    %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $8, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        vmovups 128(%rsp), %zmm0

        /* Go to exit */
        jmp     L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 zmm0

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        vmovsd  64(%rsp, %r14, 8), %xmm0
        call    log1p@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        vmovsd  %xmm0, 128(%rsp, %r14, 8)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVeN8v_log1p_skx)
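
/*
 * For illustration only: the special-value path above behaves like the
 * C loop below (hypothetical names; 'mask' is the _CMP_NEQ_UQ result
 * copied to %edx, and 'in'/'out' are the stack copies of %zmm9/%zmm0):
 *
 *   for (int i = 0; i < 8; i++)
 *     if (mask & (1 << i))
 *       out[i] = log1p (in[i]);    // one scalar libm call per bad lane
 */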

        .section .rodata, "a"
        .align  64

#ifdef __svml_dlog1p_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl[16][2];
        __declspec(align(64)) VUINT32 One[8][2];
        __declspec(align(64)) VUINT32 SgnMask[8][2];
        __declspec(align(64)) VUINT32 C075[8][2];
        __declspec(align(64)) VUINT32 poly_coeff9[8][2];
        __declspec(align(64)) VUINT32 poly_coeff8[8][2];
        __declspec(align(64)) VUINT32 poly_coeff7[8][2];
        __declspec(align(64)) VUINT32 poly_coeff6[8][2];
        __declspec(align(64)) VUINT32 poly_coeff5[8][2];
        __declspec(align(64)) VUINT32 poly_coeff4[8][2];
        __declspec(align(64)) VUINT32 poly_coeff3[8][2];
        __declspec(align(64)) VUINT32 poly_coeff2[8][2];
        __declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog1p_data_internal_avx512;
#endif
__svml_dlog1p_data_internal_avx512:
        /* Log_tbl */
        .quad   0x0000000000000000
        .quad   0xbfaf0a30c01162a6
        .quad   0xbfbe27076e2af2e6
        .quad   0xbfc5ff3070a793d4
        .quad   0xbfcc8ff7c79a9a22
        .quad   0xbfd1675cababa60e
        .quad   0xbfd4618bc21c5ec2
        .quad   0xbfd739d7f6bbd007
        .quad   0x3fd269621134db92
        .quad   0x3fcf991c6cb3b379
        .quad   0x3fca93ed3c8ad9e3
        .quad   0x3fc5bf406b543db2
        .quad   0x3fc1178e8227e47c
        .quad   0x3fb9335e5d594989
        .quad   0x3fb08598b59e3a07
        .quad   0x3fa0415d89e74444
        /* One */
        .align  64
        .quad   0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /* SgnMask */
        .align  64
        .quad   0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
        /* C075 0.75 */
        .align  64
        .quad   0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
        /* poly_coeff9 */
        .align  64
        .quad   0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70
        /* poly_coeff8 */
        .align  64
        .quad   0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62
        /* poly_coeff7 */
        .align  64
        .quad   0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF
        /* poly_coeff6 */
        .align  64
        .quad   0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06
        /* poly_coeff5 */
        .align  64
        .quad   0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C
        /* poly_coeff4 */
        .align  64
        .quad   0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD
        /* poly_coeff3 */
        .align  64
        .quad   0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466
        /* poly_coeff2 */
        .align  64
        .quad   0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6
        /* L2 = log(2) */
        .align  64
        .quad   0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF
        .align  64
        .type   __svml_dlog1p_data_internal_avx512, @object
        .size   __svml_dlog1p_data_internal_avx512, .-__svml_dlog1p_data_internal_avx512