1 /* Function atanh vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
23 * using small lookup table that map to AVX-512 permute instructions
30 * atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
34 /* Offsets for data table __svml_datanh_data_internal_avx512
41 #define RcpBitMask 448
42 #define poly_coeff8 512
43 #define poly_coeff7 576
44 #define poly_coeff6 640
45 #define poly_coeff5 704
46 #define poly_coeff4 768
47 #define poly_coeff3 832
48 #define poly_coeff2 896
49 #define poly_coeff1 960
50 #define poly_coeff0 1024
57 .section .text.evex512, "ax", @progbits
/*
 * __m512d _ZGVeN8v_atanh_skx (__m512d x) — 8-lane double-precision atanh.
 *
 * Strategy (per the header note): atanh(x) = 0.5 * log((1+x)/(1-x)).
 * The log is computed as two table-assisted log1p evaluations:
 *   Yp = 1+|x|, Ym = 1-|x|; each is scaled by a rounded reciprocal whose
 *   mantissa is truncated to a few bits (RcpBitMask), giving small reduced
 *   arguments Rp/Rm; log tables (Log_tbl_H/Log_tbl_L) supply the matching
 *   high/low log values selected with vpermi2pd/vpermt2pd.
 * Input:  %zmm2 = x.   Output: %zmm0 = atanh(x).
 * %zmm1 keeps the sign of x (restored at the end via the Half constant).
 *
 * NOTE(review): the embedded original line numbers are non-contiguous, so
 * several instructions are elided in this excerpt (e.g. the producers of
 * %zmm3/%zmm5 used at lines 95/96, the stack-frame setup, and the kmov/test
 * that sets flags before the jne at line 188). Comments below describe only
 * what is visible.
 */
58 ENTRY(_ZGVeN8v_atanh_skx)
60 cfi_def_cfa_offset(16)
/* Load 1.0 broadcast; used for Yp = 1+|x| and Ym = 1-|x|. */
66 vmovups One+__svml_datanh_data_internal_avx512(%rip), %zmm15
68 /* round reciprocals to 1+4b mantissas */
69 vmovups AddB5+__svml_datanh_data_internal_avx512(%rip), %zmm6
70 vmovups RcpBitMask+__svml_datanh_data_internal_avx512(%rip), %zmm9
/* zmm13 = |x|; zmm1 = sign bit of x (xor of x with |x|). */
72 vandpd AbsMask+__svml_datanh_data_internal_avx512(%rip), %zmm2, %zmm13
75 vaddpd {rn-sae}, %zmm15, %zmm13, %zmm0
78 vsubpd {rn-sae}, %zmm13, %zmm15, %zmm4
79 vxorpd %zmm13, %zmm2, %zmm1
/* Compensated parts: zmm7/zmm12 recover the rounding error of 1±|x|. */
82 vsubpd {rn-sae}, %zmm15, %zmm0, %zmm7
85 vsubpd {rn-sae}, %zmm15, %zmm4, %zmm12
93 /* input outside (-1, 1) ? */
/* $21 = _CMP_NLT_UQ: k0 lanes set where |x| >= 1 or x is NaN. */
94 vcmppd $21, {sae}, %zmm15, %zmm13, %k0
/* AddB5 rounds the (elided) rcp14 reciprocals of Yp/Ym before masking. */
95 vpaddq %zmm6, %zmm3, %zmm11
96 vpaddq %zmm6, %zmm5, %zmm10
/* zmm8 = |x| - (Yp - 1): low part of Yp. */
99 vsubpd {rn-sae}, %zmm7, %zmm13, %zmm8
/* zmm14 = RcpP, zmm3 = RcpM: reciprocals truncated to 1+4-bit mantissas. */
100 vandpd %zmm9, %zmm11, %zmm14
101 vandpd %zmm9, %zmm10, %zmm3
/* zmm12 = low part of Ym. */
104 vaddpd {rn-sae}, %zmm12, %zmm13, %zmm12
106 /* Reduced argument: Rp = (RcpP*Yp - 1)+RcpP*Yp_low */
107 vfmsub213pd {rn-sae}, %zmm15, %zmm14, %zmm0
109 /* Reduced argument: Rm = (RcpM*Ym - 1)+RcpM*Ym_low */
110 vfmsub231pd {rn-sae}, %zmm3, %zmm4, %zmm15
/* Exponents of the reciprocals: combined below into K = expM - expP. */
113 vgetexppd {sae}, %zmm14, %zmm5
114 vgetexppd {sae}, %zmm3, %zmm4
/* Log tables: high parts (Log_tbl_H at offset 0) and low parts. */
117 vmovups __svml_datanh_data_internal_avx512(%rip), %zmm9
118 vmovups Log_tbl_H+64+__svml_datanh_data_internal_avx512(%rip), %zmm13
119 vmovups Log_tbl_L+__svml_datanh_data_internal_avx512(%rip), %zmm7
/* Fold the low parts of Yp/Ym into the reduced arguments Rp/Rm. */
120 vfmadd231pd {rn-sae}, %zmm14, %zmm8, %zmm0
121 vfnmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm15
123 /* Prepare table index */
/* Top mantissa bits of the truncated reciprocals select the table lane. */
124 vpsrlq $48, %zmm14, %zmm11
125 vpsrlq $48, %zmm3, %zmm8
126 vmovups Log_tbl_L+64+__svml_datanh_data_internal_avx512(%rip), %zmm14
129 vmovups poly_coeff8+__svml_datanh_data_internal_avx512(%rip), %zmm3
/* K = expM - expP (exponent difference drives the K*log(2) term). */
132 vsubpd {rn-sae}, %zmm5, %zmm4, %zmm5
133 vmovups poly_coeff7+__svml_datanh_data_internal_avx512(%rip), %zmm4
135 vmovaps %zmm11, %zmm10
/* Gather table entries: zmm10/zmm11 = ThP/TlP (index zmm11),
   zmm9/zmm7 = ThM/TlM (index zmm8). */
137 vpermi2pd %zmm13, %zmm9, %zmm10
138 vpermi2pd %zmm14, %zmm7, %zmm11
139 vpermt2pd %zmm13, %zmm8, %zmm9
140 vpermt2pd %zmm14, %zmm8, %zmm7
141 vmovups poly_coeff6+__svml_datanh_data_internal_avx512(%rip), %zmm8
/* Two Horner chains run in parallel from here:
   zmm6 = PolyP(Rp=zmm0), zmm4 = PolyM(Rm=zmm15), coeff8 down to coeff0.
   (The seeds loaded into zmm6/zmm4 before these FMAs are partly elided.) */
142 vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm6
143 vfmadd231pd {rn-sae}, %zmm15, %zmm3, %zmm4
144 vmovups poly_coeff3+__svml_datanh_data_internal_avx512(%rip), %zmm13
145 vmovups poly_coeff2+__svml_datanh_data_internal_avx512(%rip), %zmm14
146 vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
147 vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
148 vmovups poly_coeff0+__svml_datanh_data_internal_avx512(%rip), %zmm8
/* Table-difference terms: Tl = TlM - TlP, Th = ThM - ThP. */
149 vsubpd {rn-sae}, %zmm11, %zmm7, %zmm12
152 vsubpd {rn-sae}, %zmm10, %zmm9, %zmm3
153 vmovups poly_coeff5+__svml_datanh_data_internal_avx512(%rip), %zmm7
154 vmovups poly_coeff4+__svml_datanh_data_internal_avx512(%rip), %zmm9
/* log(2) split: L2H high part, L2L low part, for the K*log(2) term. */
157 vmovups L2H+__svml_datanh_data_internal_avx512(%rip), %zmm10
160 vmovups L2L+__svml_datanh_data_internal_avx512(%rip), %zmm11
161 vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
162 vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
163 vmovups poly_coeff1+__svml_datanh_data_internal_avx512(%rip), %zmm7
/* zmm3 = K*L2H + Th; zmm5 = K*L2L + Tl. */
164 vfmadd231pd {rn-sae}, %zmm5, %zmm10, %zmm3
165 vfmadd213pd {rn-sae}, %zmm12, %zmm11, %zmm5
166 vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm6
167 vfmadd213pd {rn-sae}, %zmm9, %zmm15, %zmm4
168 vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm6
169 vfmadd213pd {rn-sae}, %zmm13, %zmm15, %zmm4
170 vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm6
171 vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm4
172 vfmadd213pd {rn-sae}, %zmm7, %zmm0, %zmm6
173 vfmadd213pd {rn-sae}, %zmm7, %zmm15, %zmm4
174 vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm6
175 vfmadd213pd {rn-sae}, %zmm8, %zmm15, %zmm4
177 /* (K*L2L + Tl) + Rp*PolyP */
178 vfmadd213pd {rn-sae}, %zmm5, %zmm0, %zmm6
/* zmm0 = 0.5 with the sign of x (sign bit OR'd into the Half constant). */
179 vorpd Half+__svml_datanh_data_internal_avx512(%rip), %zmm1, %zmm0
181 /* (K*L2L + Tl) + Rp*PolyP -Rm*PolyM */
182 vfnmadd213pd {rn-sae}, %zmm6, %zmm15, %zmm4
/* result = sign(x)*0.5 * ((K*L2H + Th) + low-order sum). */
183 vaddpd {rn-sae}, %zmm4, %zmm3, %zmm1
184 vmulpd {rn-sae}, %zmm0, %zmm1, %zmm0
187 /* Go to special inputs processing branch */
/* NOTE(review): the flag-setting test of the |x|>=1/NaN mask (k0 from line
   94, presumably via kmovw/test into edx) is elided from this excerpt. */
188 jne L(SPECIAL_VALUES_BRANCH)
189 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm2
192 * and exit the function
/* Slow path: spill input (zmm2) and fast-path result (zmm0), then fix up
   the flagged lanes one at a time with the scalar routine. */
208 L(SPECIAL_VALUES_BRANCH):
209 vmovups %zmm2, 64(%rsp)
210 vmovups %zmm0, 128(%rsp)
211 # LOE rbx r12 r13 r14 r15 edx zmm0
214 # LOE rbx r12 r13 r14 r15 eax edx
218 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
219 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
222 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
223 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
226 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
227 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
228 # LOE rbx r15 r12d r13d
237 /* Call scalar math function */
238 jc L(SCALAR_MATH_CALL)
239 # LOE rbx r15 r12d r13d
/* Iterate over the lanes flagged in the range mask; comparison/shift that
   feeds this jl is elided from this excerpt. */
245 L(SPECIAL_VALUES_LOOP):
249 /* Check bits in range mask */
250 jl L(RANGEMASK_CHECK)
251 # LOE rbx r15 r12d r13d
/* All flagged lanes patched; reload the merged result vector and return. */
259 vmovups 128(%rsp), %zmm0
263 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
264 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
265 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
266 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
267 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
268 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
269 # LOE rbx r12 r13 r14 r15 zmm0
271 /* Scalar math function call
272 * to process special input
/* Load lane r14 of the spilled input, call the scalar routine (call itself
   elided here), and store its result back into the spilled output slot. */
277 vmovsd 64(%rsp, %r14, 8), %xmm0
279 # LOE rbx r14 r15 r12d r13d xmm0
281 vmovsd %xmm0, 128(%rsp, %r14, 8)
283 /* Process special inputs in loop */
284 jmp L(SPECIAL_VALUES_LOOP)
285 # LOE rbx r15 r12d r13d
286 END(_ZGVeN8v_atanh_skx)
288 .section .rodata, "a"
/* Layout documentation for the table below: C-style struct mirroring the
   byte offsets used by the #define block at the top of this file
   (Log_tbl_H at 0, Log_tbl_L at 128, ..., RcpBitMask at 448,
   poly_coeff8..0 at 512..1024, then Half, L2H, L2L). The #ifdef guard is
   never defined, so none of this is assembled.
   NOTE(review): an .align 64 directive before the label is presumably
   present in the full file but elided from this excerpt. */
291 #ifdef __svml_datanh_data_internal_avx512_typedef
292 typedef unsigned int VUINT32;
294 __declspec(align(64)) VUINT32 Log_tbl_H[16][2];
295 __declspec(align(64)) VUINT32 Log_tbl_L[16][2];
296 __declspec(align(64)) VUINT32 One[8][2];
297 __declspec(align(64)) VUINT32 AbsMask[8][2];
298 __declspec(align(64)) VUINT32 AddB5[8][2];
299 __declspec(align(64)) VUINT32 RcpBitMask[8][2];
300 __declspec(align(64)) VUINT32 poly_coeff8[8][2];
301 __declspec(align(64)) VUINT32 poly_coeff7[8][2];
302 __declspec(align(64)) VUINT32 poly_coeff6[8][2];
303 __declspec(align(64)) VUINT32 poly_coeff5[8][2];
304 __declspec(align(64)) VUINT32 poly_coeff4[8][2];
305 __declspec(align(64)) VUINT32 poly_coeff3[8][2];
306 __declspec(align(64)) VUINT32 poly_coeff2[8][2];
307 __declspec(align(64)) VUINT32 poly_coeff1[8][2];
308 __declspec(align(64)) VUINT32 poly_coeff0[8][2];
309 __declspec(align(64)) VUINT32 Half[8][2];
310 __declspec(align(64)) VUINT32 L2H[8][2];
311 __declspec(align(64)) VUINT32 L2L[8][2];
312 } __svml_datanh_data_internal_avx512;
314 __svml_datanh_data_internal_avx512:
/* Log_tbl_H (offset 0): high parts of log(1/rcp) for the 16 table slots,
   indexed by the top mantissa bits of the rounded reciprocal. */
316 .quad 0x0000000000000000
317 .quad 0x3faf0a30c0100000
318 .quad 0x3fbe27076e2a0000
319 .quad 0x3fc5ff3070a80000
320 .quad 0x3fcc8ff7c79b0000
321 .quad 0x3fd1675cabab8000
322 .quad 0x3fd4618bc21c8000
323 .quad 0x3fd739d7f6bc0000
324 .quad 0x3fd9f323ecbf8000
325 .quad 0x3fdc8ff7c79a8000
326 .quad 0x3fdf128f5faf0000
327 .quad 0x3fe0be72e4254000
328 .quad 0x3fe1e85f5e704000
329 .quad 0x3fe307d7334f0000
330 .quad 0x3fe41d8fe8468000
331 .quad 0x3fe52a2d265bc000
/* Log_tbl_L (offset 128): corresponding low-order correction parts. */
334 .quad 0x0000000000000000
335 .quad 0x3d662a6617cc9717
336 .quad 0x3d6e5cbd3d50fffc
337 .quad 0xbd6b0b0de3077d7e
338 .quad 0xbd697794f689f843
339 .quad 0x3d630701ce63eab9
340 .quad 0xbd609ec17a426426
341 .quad 0xbd67fcb18ed9d603
342 .quad 0x3d584bf2b68d766f
343 .quad 0x3d5a21ac25d81ef3
344 .quad 0x3d3bb2cd720ec44c
345 .quad 0xbd657d49676844cc
346 .quad 0x3d1a07bd8b34be7c
347 .quad 0x3d60be1fb590a1f5
348 .quad 0xbd5aa33736867a17
349 .quad 0x3d46abb9df22bc57
/* One (offset 256): 1.0 broadcast. */
352 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/* AbsMask (offset 320): clears the sign bit. */
355 .quad 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff
/* AddB5 (offset 384): rounding increment applied to the reciprocal before
   its mantissa is truncated by RcpBitMask. */
358 .quad 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000, 0x0000800000000000
/* RcpBitMask (offset 448): keeps sign/exponent and top 4 mantissa bits. */
361 .quad 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000, 0xffff000000000000
/* poly_coeff8..poly_coeff0 (offsets 512..1024): log1p polynomial
   coefficients, highest order first (coeff1 ~ -1/2, coeff0 = 1.0). */
364 .quad 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142, 0x3fbc81dd40d38142
367 .quad 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70, 0xbfc0073cb82e8b70
370 .quad 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8, 0x3fc2492298ffdae8
373 .quad 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5, 0xbfc55553f871e5c5
376 .quad 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a, 0x3fc9999999cd394a
379 .quad 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01, 0xbfd00000000c2a01
382 .quad 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462, 0x3fd5555555555462
385 .quad 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5, 0xbfdfffffffffffc5
388 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/* Half (offset 1088): 0.5, OR'd with the input sign to form ±0.5. */
391 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
392 /* L2H = log(2)_high */
394 .quad 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000, 0x3fe62E42FEFA0000
395 /* L2L = log(2)_low */
397 .quad 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000, 0x3d7cf79abc9e0000
399 .type __svml_datanh_data_internal_avx512, @object
400 .size __svml_datanh_data_internal_avx512, .-__svml_datanh_data_internal_avx512