/* Function atanh vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)),
 *   using a small lookup table that maps to AVX-512 permute
 *   instructions (see the scalar reference sketch below).
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 *
 */
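
/* For reference, the identity above in scalar form.  This is an
 * illustrative sketch only (the helper name atanh_ref is hypothetical
 * and the block is not part of the build); the vector code below uses a
 * table-driven log reduction and polynomials instead of calling log():
 *
 *   #include <math.h>
 *
 *   static double atanh_ref (double x)
 *   {
 *     // |x| > 1, NaN and +/-1 produce NaN/Inf through log() itself.
 *     return 0.5 * log ((1.0 + x) / (1.0 - x));
 *   }
 */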

/* Offsets for data table __svml_datanh_data_internal_avx512
 */
#define Log_tbl_H                       0
#define Log_tbl_L                       128
#define One                             256
#define AbsMask                         320
#define AddB5                           384
#define RcpBitMask                      448
#define poly_coeff8                     512
#define poly_coeff7                     576
#define poly_coeff6                     640
#define poly_coeff5                     704
#define poly_coeff4                     768
#define poly_coeff3                     832
#define poly_coeff2                     896
#define poly_coeff1                     960
#define poly_coeff0                     1024
#define Half                            1088
#define L2H                             1152
#define L2L                             1216

#include <sysdep.h>

        .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_atanh_skx)
        pushq   %rbp
        cfi_def_cfa_offset(16)
        movq    %rsp, %rbp
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)
        andq    $-64, %rsp
        subq    $192, %rsp
        vmovups One+__svml_datanh_data_internal_avx512(%rip), %zmm15

        /* round reciprocals to 1+4b mantissas */
        vmovups AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
        vmovups RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
        vmovaps %zmm0, %zmm2
        vandpd  AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13

        /* 1+y */
        vaddpd  {rn-sae}, %zmm15, %zmm13, %zmm0

        /* 1-y */
        vsubpd  {rn-sae}, %zmm13, %zmm15, %zmm4
        vxorpd  %zmm13, %zmm2, %zmm1

        /* Yp_high */
        vsubpd  {rn-sae}, %zmm15, %zmm0, %zmm7

        /* -Ym_high */
        vsubpd  {rn-sae}, %zmm15, %zmm4, %zmm12

        /* RcpP ~ 1/Yp */
        vrcp14pd %zmm0, %zmm3

        /* RcpM ~ 1/Ym */
        vrcp14pd %zmm4, %zmm5

        /* input outside (-1, 1) ? */
        vcmppd  $21, {sae}, %zmm15, %zmm13, %k0
        vpaddq  %zmm6, %zmm3, %zmm11
        vpaddq  %zmm6, %zmm5, %zmm10

        /* Yp_low */
        vsubpd  {rn-sae}, %zmm7, %zmm13, %zmm8
        vandpd  %zmm9, %zmm11, %zmm14
        vandpd  %zmm9, %zmm10, %zmm3
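
        /* AddB5 adds bit 47 so each vrcp14 result rounds to the nearest
         * 1+4-bit mantissa, and RcpBitMask keeps only the sign, exponent
         * and top four mantissa bits; the same rounded reciprocal then
         * drives both the FMA argument reduction and the 16-entry
         * log-table lookup below.  */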

        /* Ym_low */
        vaddpd  {rn-sae}, %zmm12, %zmm13, %zmm12

        /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
        vfmsub213pd {rn-sae}, %zmm15, %zmm14, %zmm0

        /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
        vfmsub231pd {rn-sae}, %zmm3, %zmm4, %zmm15

        /* exponents */
        vgetexppd {sae}, %zmm14, %zmm5
        vgetexppd {sae}, %zmm3, %zmm4

        /* Table lookups */
        vmovups __svml_datanh_data_internal_avx512(%rip), %zmm9
        vmovups Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
        vmovups Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm0
        vfnmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm15
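
        /* The FMA pairs above complete the reduced arguments spelled out
         * in the comments: the vfmsub forms Rcp*Y - 1 and the following
         * vfmadd/vfnmadd folds in Rcp*Y_low, keeping Rp and Rm small and
         * accurate enough for the shared log polynomial.  */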

        /* Prepare table index */
        vpsrlq  $48, %zmm14, %zmm11
        vpsrlq  $48, %zmm3, %zmm8
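
        /* After the shift the low four bits of each lane are the rounded
         * reciprocal's top mantissa bits; vpermi2pd/vpermt2pd below use
         * them to select one of the 16 Log_tbl_H/Log_tbl_L entries.  */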
        vmovups Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14

        /* polynomials */
        vmovups poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3

        /* Km-Kp */
        vsubpd  {rn-sae}, %zmm5, %zmm4, %zmm5
        vmovups poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
        kmovw   %k0, %edx
        vmovaps %zmm11, %zmm10
        vmovaps %zmm4, %zmm6
        vpermi2pd %zmm13, %zmm9, %zmm10
        vpermi2pd %zmm14, %zmm7, %zmm11
        vpermt2pd %zmm13, %zmm8, %zmm9
        vpermt2pd %zmm14, %zmm8, %zmm7
        vmovups poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
        vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm6
        vfmadd231pd {rn-sae}, %zmm15, %zmm3, %zmm4
        vmovups poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
        vmovups poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
        vmovups poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
        vsubpd  {rn-sae}, %zmm11, %zmm7, %zmm12

        /* table values */
        vsubpd  {rn-sae}, %zmm10, %zmm9, %zmm3
        vmovups poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vmovups poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9

        /* K*L2H + Th */
        vmovups L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10

        /* K*L2L + Tl */
        vmovups L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
        vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
        vmovups poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
        vfmadd231pd {rn-sae}, %zmm5, %zmm10, %zmm3
        vfmadd213pd {rn-sae}, %zmm12, %zmm11, %zmm5
        vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm9, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm13, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
        vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
        vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
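
        /* The interleaved vfmadd213pd chains above are two Horner
         * evaluations run in parallel: PolyP(Rp) in %zmm6 and PolyM(Rm)
         * in %zmm4, sharing the coefficients poly_coeff8..poly_coeff0.  */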

        /* (K*L2L + Tl) + Rp*PolyP */
        vfmadd213pd {rn-sae}, %zmm5, %zmm0, %zmm6
        vorpd   Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0

        /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
        vfnmadd213pd {rn-sae}, %zmm6, %zmm15, %zmm4
        vaddpd  {rn-sae}, %zmm4, %zmm3, %zmm1
        vmulpd  {rn-sae}, %zmm0, %zmm1, %zmm0
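
        /* Result assembly: %zmm3 carries the high part (table difference
         * plus (Km-Kp)*L2H), %zmm4 the low part plus Rp*PolyP - Rm*PolyM,
         * and the final vmulpd scales their sum by (sign(x) | 0.5) from
         * %zmm0, i.e. 0.5 * sign(x) * log((1 + |x|)/(1 - |x|)).  */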
        testl   %edx, %edx

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm2

        /* Restore registers
         * and exit the function
         */

L(EXIT):
        movq    %rbp, %rsp
        popq    %rbp
        cfi_def_cfa(7, 8)
        cfi_restore(6)
        ret
        cfi_def_cfa(6, 16)
        cfi_offset(6, -16)

        /* Branch to process
         * special inputs
         */
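
        /* The slow path below spills the input vector to 64(%rsp) and the
         * fast-path result to 128(%rsp), then walks the range mask in
         * %edx and calls the scalar atanh() for every flagged lane,
         * overwriting that lane of the saved result.  Roughly (C sketch,
         * illustrative only):
         *
         *   for (int i = 0; i < 8; i++)       // mask bit set when
         *     if (mask & (1 << i))            // |x| >= 1 or x is NaN
         *       result[i] = atanh (input[i]);
         */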

L(SPECIAL_VALUES_BRANCH):
        vmovups %zmm2, 64(%rsp)
        vmovups %zmm0, 128(%rsp)
        # LOE rbx r12 r13 r14 r15 edx zmm0

        xorl    %eax, %eax
        # LOE rbx r12 r13 r14 r15 eax edx

        vzeroupper
        movq    %r12, 16(%rsp)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        movl    %eax, %r12d
        movq    %r13, 8(%rsp)
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        movl    %edx, %r13d
        movq    %r14, (%rsp)
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d

        /* Range mask
         * bits check
         */

L(RANGEMASK_CHECK):
        btl     %r12d, %r13d

        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

        /* Special inputs
         * processing loop
         */

L(SPECIAL_VALUES_LOOP):
        incl    %r12d
        cmpl    $8, %r12d

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d

        movq    16(%rsp), %r12
        cfi_restore(12)
        movq    8(%rsp), %r13
        cfi_restore(13)
        movq    (%rsp), %r14
        cfi_restore(14)
        vmovups 128(%rsp), %zmm0

        /* Go to exit */
        jmp     L(EXIT)
        /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus)  */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 zmm0

        /* Scalar math function call
         * to process special input
         */

L(SCALAR_MATH_CALL):
        movl    %r12d, %r14d
        movsd   64(%rsp, %r14, 8), %xmm0
        call    atanh@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        movsd   %xmm0, 128(%rsp, %r14, 8)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVeN8v_atanh_skx)

        .section .rodata, "a"
        .align  64

#ifdef __svml_datanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 Log_tbl_H[16][2];
        __declspec(align(64)) VUINT32 Log_tbl_L[16][2];
        __declspec(align(64)) VUINT32 One[8][2];
        __declspec(align(64)) VUINT32 AbsMask[8][2];
        __declspec(align(64)) VUINT32 AddB5[8][2];
        __declspec(align(64)) VUINT32 RcpBitMask[8][2];
        __declspec(align(64)) VUINT32 poly_coeff8[8][2];
        __declspec(align(64)) VUINT32 poly_coeff7[8][2];
        __declspec(align(64)) VUINT32 poly_coeff6[8][2];
        __declspec(align(64)) VUINT32 poly_coeff5[8][2];
        __declspec(align(64)) VUINT32 poly_coeff4[8][2];
        __declspec(align(64)) VUINT32 poly_coeff3[8][2];
        __declspec(align(64)) VUINT32 poly_coeff2[8][2];
        __declspec(align(64)) VUINT32 poly_coeff1[8][2];
        __declspec(align(64)) VUINT32 poly_coeff0[8][2];
        __declspec(align(64)) VUINT32 Half[8][2];
        __declspec(align(64)) VUINT32 L2H[8][2];
        __declspec(align(64)) VUINT32 L2L[8][2];
} __svml_datanh_data_internal_avx512;
#endif
__svml_datanh_data_internal_avx512:
        /* Log_tbl_H */
        .quad   0x0000000000000000
        .quad   0x3faf0a30c0100000
        .quad   0x3fbe27076e2a0000
        .quad   0x3fc5ff3070a80000
        .quad   0x3fcc8ff7c79b0000
        .quad   0x3fd1675cabab8000
        .quad   0x3fd4618bc21c8000
        .quad   0x3fd739d7f6bc0000
        .quad   0x3fd9f323ecbf8000
        .quad   0x3fdc8ff7c79a8000
        .quad   0x3fdf128f5faf0000
        .quad   0x3fe0be72e4254000
        .quad   0x3fe1e85f5e704000
        .quad   0x3fe307d7334f0000
        .quad   0x3fe41d8fe8468000
        .quad   0x3fe52a2d265bc000
        /* Log_tbl_L */
        .align  64
        .quad   0x0000000000000000
        .quad   0x3d662a6617cc9717
        .quad   0x3d6e5cbd3d50fffc
        .quad   0xbd6b0b0de3077d7e
        .quad   0xbd697794f689f843
        .quad   0x3d630701ce63eab9
        .quad   0xbd609ec17a426426
        .quad   0xbd67fcb18ed9d603
        .quad   0x3d584bf2b68d766f
        .quad   0x3d5a21ac25d81ef3
        .quad   0x3d3bb2cd720ec44c
        .quad   0xbd657d49676844cc
        .quad   0x3d1a07bd8b34be7c
        .quad   0x3d60be1fb590a1f5
        .quad   0xbd5aa33736867a17
        .quad   0x3d46abb9df22bc57
        /* One */
        .align  64
        .quad   0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /* AbsMask */
        .align  64
        .quad   0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
        /* AddB5 */
        .align  64
        .quad   0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
        /* RcpBitMask */
        .align  64
        .quad   0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
        /* poly_coeff8 */
        .align  64
        .quad   0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
        /* poly_coeff7 */
        .align  64
        .quad   0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
        /* poly_coeff6 */
        .align  64
        .quad   0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
        /* poly_coeff5 */
        .align  64
        .quad   0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
        /* poly_coeff4 */
        .align  64
        .quad   0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
        /* poly_coeff3 */
        .align  64
        .quad   0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
        /* poly_coeff2 */
        .align  64
        .quad   0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
        /* poly_coeff1 */
        .align  64
        .quad   0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
        /* poly_coeff0 */
        .align  64
        .quad   0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
        /* Half */
        .align  64
        .quad   0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
        /* L2H = log(2)_high */
        .align  64
        .quad   0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
        /* L2L = log(2)_low */
        .align  64
        .quad   0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
        .align  64
        .type   __svml_datanh_data_internal_avx512, @object
        .size   __svml_datanh_data_internal_avx512, .-__svml_datanh_data_internal_avx512