1 /* Function atanhf vectorized with SSE4.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
29 * atanh(x) = NaN if |x| > 1, or if x is a NaN or INF
33 /* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
34 by use in the function. On cold-starts this might help the
35 prefetcher. Possibly a better idea is to interleave start/end so
36 that the prefetcher is less likely to detect a stream and pull
37 irrelevant lines into cache. */
/* Byte offset of the iOffExpoMask row inside
   __svml_satanh_data_internal (fifth 16-byte-aligned entry:
   sOne=0, SgnMask=16, sTopMask12=32, iBrkValue=48, iOffExpoMask=64).  */
42 #define iOffExpoMask 64
/* Address of field X within the shared data table, for use as a
   RIP-relative memory operand: ATANHF_DATA(field)(%rip).  */
48 #define ATANHF_DATA(x) ((x)+__svml_satanh_data_internal)
50 .section .text.sse4, "ax", @progbits
/* float32x4 _ZGVbN4v_atanhf_sse4 (float32x4 x) — SysV AMD64 vector ABI
   ('b' = SSE4 variant, N4 = 4 lanes).
   In:  xmm0 = 4 packed single-precision lanes of x.
   Out: xmm0 = 4 packed single-precision lanes of atanh(x).
   NOTE(review): this listing is elided (the embedded source line numbers
   are non-contiguous); comments below describe only the visible
   instructions — confirm against the full source before relying on them.  */
51 ENTRY(_ZGVbN4v_atanhf_sse4)
54 /* Load constants including One = 1 */
55 movups ATANHF_DATA(sOne)(%rip), %xmm4
58 /* Strip off the sign, so treat X as positive until right at the end */
59 movups ATANHF_DATA(SgnMask)(%rip), %xmm1
/* Mask that clears the low 12 significand bits, used to split values
   into a short "hi" part whose products are exact (see comment below).  */
63 movups ATANHF_DATA(sTopMask12)(%rip), %xmm11
69 * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
70 * the upper part UHi being <= 12 bits long. Then we have
71 * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
84 * Check whether |X| < 1, in which case we use the main function.
85 * Otherwise set the rangemask so that the callout will get used.
86 * Note that this will also use the callout for NaNs since not(NaN < 1).
94 * Split V as well into upper 12 bits and lower part, so that we can get
95 * a preliminary quotient estimate without rounding error.
102 /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
109 /* Compute D = E + E^2 */
110 movaps %xmm14, %xmm13
114 /* reduction: compute r,n */
115 movdqu ATANHF_DATA(iBrkValue)(%rip), %xmm9
119 * Compute R * (VHi + VLo) * (1 + E + E^2)
120 * = R * (VHi + VLo) * (1 + D)
121 * = QHi + (QHi * D + QLo + QLo * D)
127 movdqu ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
130 /* Record the sign for eventual reincorporation. */
135 * Now finally accumulate the high and low parts of the
136 * argument to log1p, H + L, with a final compensated summation.
141 /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
/* Convert the reduction exponents n (int32 lanes) to float for the
   n * ln(2) term added during reconstruction (see mulps by sLn2 below).  */
154 cvtdq2ps %xmm10, %xmm13
157 /* final reconstruction */
162 /* polynomial evaluation */
165 movups ATANHF_DATA(sPoly+0)(%rip), %xmm7
170 /* Finally, halve the result and reincorporate the sign */
/* Fold in successive sPoly coefficients (P6 .. P0 per the table);
   presumably Horner-style — the interleaved multiplies are not shown
   in this elided listing.  */
171 addps ATANHF_DATA(sPoly+16)(%rip), %xmm7
173 addps ATANHF_DATA(sPoly+32)(%rip), %xmm7
175 addps ATANHF_DATA(sPoly+48)(%rip), %xmm7
177 addps ATANHF_DATA(sPoly+64)(%rip), %xmm7
179 addps ATANHF_DATA(sPoly+80)(%rip), %xmm7
181 addps ATANHF_DATA(sPoly+96)(%rip), %xmm7
183 movaps ATANHF_DATA(sPoly+112)(%rip), %xmm6
/* n * ln(2): scale the converted exponents for the log reconstruction.  */
187 mulps ATANHF_DATA(sLn2)(%rip), %xmm13
188 /* We can build `sHalf` with `sPoly & sOne`. */
197 /* Finish check of NaNs. */
/* Mark lanes below the TinyRange threshold; per the atanh(-0) note
   above these take the tiny path.  NOTE(review): the value in xmm0
   here is not visible in this listing — presumably |x|; confirm.  */
200 cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
207 /* Go to special inputs processing branch. */
208 jne L(SPECIAL_VALUES_BRANCH)
209 # LOE rbx rbp r12 r13 r14 r15 xmm0
210 /* No registers to restore on fast path. */
214 /* Cold case. edx has 1s where there was a special value that
215 needs to be handled by an atanhf call. Optimize for code size
216 more so than speed here. */
217 L(SPECIAL_VALUES_BRANCH):
218 # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
219 /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
220 call entry will be 16-byte aligned. */
222 cfi_def_cfa_offset(64)
/* Spill so the scalar callout can run lane by lane: 24(%rsp) holds the
   result vector, 40(%rsp) the original inputs (see the movss pair in
   the loop below and the reload at the end).  */
223 movups %xmm0, 24(%rsp)
224 movups %xmm5, 40(%rsp)
226 /* Use rbx/rbp for callee save registers as they get short
227 encoding for many instructions (as compared with r12/r13). */
232 /* edx has 1s where there was a special value that needs to be handled
235 L(SPECIAL_VALUES_LOOP):
236 # LOE rbx rbp r12 r13 r14 r15
237 /* use rbp as index for special value that is saved across calls to
238 atanhf. We technically don't need a callee save register here as offset
239 to rsp is always [0, 12] so we can restore rsp by realigning to 64.
240 Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
245 /* Scalar math function call to process special input. */
246 movss 40(%rsp, %rbp, 4), %xmm0
248 /* No good way to avoid the store-forwarding fault this will cause on
249 return. `lfence` avoids the SF fault but at greater cost as it
250 serializes stack/callee save restoration. */
251 movss %xmm0, 24(%rsp, %rbp, 4)
255 jnz L(SPECIAL_VALUES_LOOP)
256 # LOE r12 r13 r14 r15
257 /* All results have been written to 24(%rsp). */
258 movups 24(%rsp), %xmm0
264 cfi_def_cfa_offset(8)
266 END(_ZGVbN4v_atanhf_sse4)
268 .section .rodata, "a"
/* C-style layout reference for the table below; compiled only if the
   guard macro is defined (documentation aid, normally inert).  */
271 #ifdef __svml_satanh_data_internal_typedef
272 typedef unsigned int VUINT32;
274 __declspec(align(16)) VUINT32 sOne[4][1];
275 __declspec(align(16)) VUINT32 SgnMask[4][1];
276 __declspec(align(16)) VUINT32 sTopMask12[4][1];
277 __declspec(align(16)) VUINT32 iBrkValue[4][1];
278 __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
279 __declspec(align(16)) VUINT32 sPoly[8][4][1];
280 __declspec(align(16)) VUINT32 sLn2[4][1];
281 __declspec(align(16)) VUINT32 TinyRange[4][1];
282 } __svml_satanh_data_internal;
285 __svml_satanh_data_internal:
/* sOne = 1.0f in every lane.  */
288 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* SgnMask = all bits except the sign bit (used to form |x|).  */
290 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* sTopMask12 = clear the low 12 bits, keeping sign + exponent + top
   significand bits so masked values multiply without rounding error.  */
293 .long 0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
294 /* iBrkValue = SP 2/3 */
296 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
297 /* iOffExpoMask = SP significand mask.  */
299 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
301 /* sPoly[] = SP polynomial */
303 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
304 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
305 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
306 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
307 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
308 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
309 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
310 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
312 /* sLn2 = SP ln(2) */
314 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
/* TinyRange threshold, 0x0C000000 = 2^-103 (~9.9e-32) as SP float.
   NOTE(review): name inferred from the layout reference above and the
   cmpltps use site — confirm against the full source.  */
317 .long 0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
319 .type __svml_satanh_data_internal, @object
320 .size __svml_satanh_data_internal, .-__svml_satanh_data_internal