1 /* Function asinhf vectorized with SSE4.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute asinh(x) as log(x + sqrt(x*x + 1))
26 * asinh(NaN) = quiet NaN, and raise invalid exception
27 * asinh(INF) = that INF
32 /* Offsets for data table __svml_sasinh_data_internal
38 #define iOffExpoMask 176
39 #define sBigThreshold 192
43 #define sLargestFinite 256
44 #define sLittleThreshold 272
46 #define sThirtyOne 304
47 #define sTopMask11 320
54 .section .text.sse4, "ax", @progbits
55 ENTRY(_ZGVbN4v_asinhf_sse4)
57 cfi_def_cfa_offset(80)
61 * Split X into high and low parts, XHi (<= 11 bits) and XLo (<= 13 bits)
62 * We could use either X or |X| here, but it doesn't seem to matter
64 movups sTopMask11+__svml_sasinh_data_internal(%rip), %xmm10
69 * Compute X^2 = (XHi + XLo)^2 = XHi^2 + XLo * (X + XHi)
70 * The two parts are shifted off by around 11 bits. So even though
71 * the low bit will not in general be exact, it's near enough
78 /* Load the constant 1 and a sign mask */
79 movups sOne+__svml_sasinh_data_internal(%rip), %xmm7
82 * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
83 * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
84 * as the dominant component in the compensated summation. Otherwise,
85 * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
86 * addition will be exact anyway until we get to |X| >= 2^24. But by
87 * that time the log function is well-conditioned enough that the
88 * rounding error doesn't matter. Hence we can treat 1 as dominant even
89 * if it literally isn't.
93 movups sTopMask8+__svml_sasinh_data_internal(%rip), %xmm12
101 * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
102 * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
103 * by 1/X and comes close to our threshold. Hence if |X| <= 2^-4,
104 * perform an alternative computation
105 * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
113 * Compute R = 1/sqrt(Y + W) * (1 + d)
114 * Force R to <= 8 significant bits.
115 * This means that R * Y and R^2 * Y are exactly representable.
117 rsqrtps %xmm0, %xmm14
123 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
124 * and T = (W/sqrt(Y + W)) * (1 + d)
125 * so that S + T = sqrt(Y + W) * (1 + d)
126 * S is exact, and the rounding error in T is OK.
132 * Get the absolute value of the input, since we will exploit antisymmetry
133 * and mostly assume X >= 0 in the core computation
135 movups SgnMask+__svml_sasinh_data_internal(%rip), %xmm6
138 * Compute e = -(2 * d + d^2)
139 * The first FMR is exact, and the rounding error in the other is acceptable
140 * since d and e are ~ 2^-8
142 movaps %xmm14, %xmm13
146 * Obtain sqrt(1 + X^2) - 1 in two pieces
149 * = (S + T) * (1 + Corr) - 1
150 * = [S - 1] + [T + (S + T) * Corr]
151 * We need a compensated summation for the last part. We treat S - 1
152 * as the larger part; it certainly is until about X < 2^-4, and in that
153 * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1
154 * Final sum is dTmp5 (hi) + dTmp7 (lo)
159 * Check whether the input is finite, by checking |X| <= MaxFloat
160 * Otherwise set the rangemask so that the callout will get used.
161 * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
166 * The following computation can go wrong for very large X, basically
167 * because X^2 overflows. But for large X we have
168 * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
169 * we can just later stick X back into the log and tweak up the exponent.
170 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
171 * to stay in the safe range for the later log computation.
172 * Compute a flag now telling us when do do this.
175 cmpnleps sLargestFinite+__svml_sasinh_data_internal(%rip), %xmm9
176 cmpltps sBigThreshold+__svml_sasinh_data_internal(%rip), %xmm5
186 * = 1 / (1 + (sqrt(1 - e) - 1))
188 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
189 * So compute the first three nonconstant terms of that, so that
190 * we have a relative correction (1 + Corr) to apply to S etc.
195 movups sC3+__svml_sasinh_data_internal(%rip), %xmm15
197 movups sHalf+__svml_sasinh_data_internal(%rip), %xmm10
200 /* sX2over2 = X^2/2 */
204 /* sX46 = -X^4/4 + X^6/8 */
209 * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
210 * It's always safe to assume |X| is larger.
211 * This is the final 2-part argument to the log1p function
214 addps sC2+__svml_sasinh_data_internal(%rip), %xmm15
220 /* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
221 movups XScale+__svml_sasinh_data_internal(%rip), %xmm15
228 /* sX4over4 = X^4/4 */
235 * Now we feed into the log1p code, using H in place of _VARG1 and
236 * also adding L into Xl.
237 * compute 1+x as high, low parts
241 /* sX46over2 = -X^4/8 + x^6/16 */
248 /* Now multiplex the two possible computations */
250 cmpleps sLittleThreshold+__svml_sasinh_data_internal(%rip), %xmm3
253 andnps %xmm11, %xmm13
276 * Now resume the main code.
277 * reduction: compute r, n
279 movdqu iBrkValue+__svml_sasinh_data_internal(%rip), %xmm6
284 movdqu iOffExpoMask+__svml_sasinh_data_internal(%rip), %xmm2
287 cvtdq2ps %xmm4, %xmm3
294 /* polynomial evaluation */
296 movups sPoly+112+__svml_sasinh_data_internal(%rip), %xmm7
301 /* Add 31 to the exponent in the "large" case to get log(2 * input) */
302 movups sThirtyOne+__svml_sasinh_data_internal(%rip), %xmm0
303 addps sPoly+96+__svml_sasinh_data_internal(%rip), %xmm7
309 addps sPoly+80+__svml_sasinh_data_internal(%rip), %xmm7
311 /* final reconstruction */
312 mulps sLn2+__svml_sasinh_data_internal(%rip), %xmm2
315 /* Finally, reincorporate the original sign. */
316 movups sSign+__svml_sasinh_data_internal(%rip), %xmm0
318 addps sPoly+64+__svml_sasinh_data_internal(%rip), %xmm7
320 addps sPoly+48+__svml_sasinh_data_internal(%rip), %xmm7
322 addps sPoly+32+__svml_sasinh_data_internal(%rip), %xmm7
324 addps sPoly+16+__svml_sasinh_data_internal(%rip), %xmm7
326 addps sPoly+__svml_sasinh_data_internal(%rip), %xmm7
334 /* Go to special inputs processing branch */
335 jne L(SPECIAL_VALUES_BRANCH)
336 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm8
339 * and exit the function
344 cfi_def_cfa_offset(8)
346 cfi_def_cfa_offset(80)
352 L(SPECIAL_VALUES_BRANCH):
353 movups %xmm8, 32(%rsp)
354 movups %xmm0, 48(%rsp)
355 # LOE rbx rbp r12 r13 r14 r15 edx
366 # LOE rbx rbp r15 r12d r13d
375 /* Call scalar math function */
376 jc L(SCALAR_MATH_CALL)
377 # LOE rbx rbp r15 r12d r13d
383 L(SPECIAL_VALUES_LOOP):
387 /* Check bits in range mask */
388 jl L(RANGEMASK_CHECK)
389 # LOE rbx rbp r15 r12d r13d
397 movups 48(%rsp), %xmm0
404 # LOE rbx rbp r12 r13 r14 r15 xmm0
406 /* Scalar math fucntion call
407 * to process special input
412 movss 32(%rsp, %r14, 4), %xmm0
414 # LOE rbx rbp r14 r15 r12d r13d xmm0
416 movss %xmm0, 48(%rsp, %r14, 4)
418 /* Process special inputs in loop */
419 jmp L(SPECIAL_VALUES_LOOP)
420 # LOE rbx rbp r15 r12d r13d
421 END(_ZGVbN4v_asinhf_sse4)
423 .section .rodata, "a"
/* Constant table for the vector asinhf implementation.  The field order
   below must agree with the byte offsets #defined at the top of the
   file (e.g. iOffExpoMask at +176, sBigThreshold at +192, sTopMask11
   at +320); each entry is one 16-byte vector of four identical lanes. */
426 #ifdef __svml_sasinh_data_internal_typedef
427 typedef unsigned int VUINT32;
429 __declspec(align(16)) VUINT32 SgnMask[4][1];
430 __declspec(align(16)) VUINT32 sOne[4][1];
431 __declspec(align(16)) VUINT32 sPoly[8][4][1];
432 __declspec(align(16)) VUINT32 iBrkValue[4][1];
433 __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
434 __declspec(align(16)) VUINT32 sBigThreshold[4][1];
435 __declspec(align(16)) VUINT32 sC2[4][1];
436 __declspec(align(16)) VUINT32 sC3[4][1];
437 __declspec(align(16)) VUINT32 sHalf[4][1];
438 __declspec(align(16)) VUINT32 sLargestFinite[4][1];
439 __declspec(align(16)) VUINT32 sLittleThreshold[4][1];
440 __declspec(align(16)) VUINT32 sSign[4][1];
441 __declspec(align(16)) VUINT32 sThirtyOne[4][1];
442 __declspec(align(16)) VUINT32 sTopMask11[4][1];
443 __declspec(align(16)) VUINT32 sTopMask8[4][1];
444 __declspec(align(16)) VUINT32 XScale[4][1];
445 __declspec(align(16)) VUINT32 sLn2[4][1];
446 } __svml_sasinh_data_internal;
448 __svml_sasinh_data_internal:
/* SgnMask = mask clearing the sign bit (computes |X|) */
450 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* sOne = SP 1.0 */
453 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
454 /* sPoly[] = SP polynomial */
456 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
457 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
458 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
459 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
460 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
461 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
462 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
463 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
464 /* iBrkValue = SP 2/3 */
466 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
467 /* iOffExpoMask = SP significand mask */
469 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/* sBigThreshold = SP 2^30, cutoff for the scaled "huge |X|" path */
472 .long 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
/* sC2 = SP 3/8, rsqrt correction-series coefficient */
475 .long 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
/* sC3 = SP 5/16, rsqrt correction-series coefficient */
478 .long 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
/* sHalf = SP 0.5 */
481 .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
/* sLargestFinite = SP FLT_MAX; inputs failing |X| <= FLT_MAX go to the callout */
484 .long 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
485 /* sLittleThreshold = SP 2^-4, cutoff for the small-|X| series path */
487 .long 0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000
/* sSign = sign-bit mask, used to reincorporate the input's sign */
490 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/* sThirtyOne = SP 31.0, exponent adjustment for the "huge |X|" path */
493 .long 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
/* sTopMask11 = keep sign+exponent+10 mantissa bits (<= 11 significant bits) */
496 .long 0xFFFFE000, 0xFFFFE000, 0xFFFFE000, 0xFFFFE000
/* sTopMask8 = keep sign+exponent+7 mantissa bits (<= 8 significant bits) */
499 .long 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
/* XScale = SP 2^-30, rescale factor for the "huge |X|" path */
502 .long 0x30800000, 0x30800000, 0x30800000, 0x30800000
503 /* sLn2 = SP ln(2) */
505 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
507 .type __svml_sasinh_data_internal, @object
508 .size __svml_sasinh_data_internal, .-__svml_sasinh_data_internal