1 /* Function acoshf vectorized with AVX2.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute acosh(x) as log(x + sqrt(x*x - 1))
26 * acosh(NaN) = quiet NaN, and raise invalid exception
29 * acosh(x) = NaN if x < 1
34 /* Offsets for data table __svml_sacosh_data_internal
39 #define iOffExpoMask 320
40 #define sBigThreshold 352
44 #define sLargestFinite 480
45 #define sThirtyOne 512
52 .section .text.avx2, "ax", @progbits
53 ENTRY(_ZGVdN8v_acoshf_avx2)
55 cfi_def_cfa_offset(16)
62 /* Load constants, always including One = 1 */
63 vmovups sOne+__svml_sacosh_data_internal(%rip), %ymm2
65 /* Finally, express Y + W = U * V accurately where Y has <= 8 bits */
66 vmovups sTopMask8+__svml_sacosh_data_internal(%rip), %ymm9
70 * = 1 / (1 + (sqrt(1 - e) - 1))
72 * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
73 * So compute the first three nonconstant terms of that, so that
74 * we have a relative correction (1 + Corr) to apply to S etc.
79 vmovups sC3+__svml_sacosh_data_internal(%rip), %ymm14
82 vfmsub231ps %ymm3, %ymm3, %ymm7
85 * Check that 1 < X < +inf; otherwise go to the callout function.
86 * We need the callout for X = 1 to avoid division by zero below.
87 * This test ensures that callout handles NaN and either infinity.
89 vcmpnle_uqps sLargestFinite+__svml_sacosh_data_internal(%rip), %ymm3, %ymm4
90 vcmpngt_uqps %ymm2, %ymm3, %ymm5
93 * The following computation can go wrong for very large X, e.g.
94 * the X^2 - 1 = U * V can overflow. But for large X we have
95 * acosh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
96 * we can just later stick X back into the log and tweak up the exponent.
97 * Actually we scale X by 2^-30 and tweak the exponent up by 31,
98 * to stay in the safe range for the later log computation.
99 * Compute a flag now telling us when to do this.
101 vcmplt_oqps sBigThreshold+__svml_sacosh_data_internal(%rip), %ymm3, %ymm1
102 vandps %ymm9, %ymm7, %ymm10
105 * Compute R = 1/sqrt(Y + W) * (1 + d)
106 * Force R to <= 8 significant bits.
107 * This means that R * Y and R^2 * Y are exactly representable.
109 vrsqrtps %ymm10, %ymm8
110 vsubps %ymm10, %ymm7, %ymm11
111 vandps %ymm9, %ymm8, %ymm12
114 * Compute S = (Y/sqrt(Y + W)) * (1 + d)
115 * and T = (W/sqrt(Y + W)) * (1 + d)
116 * so that S + T = sqrt(Y + W) * (1 + d)
117 * S is exact, and the rounding error in T is OK.
119 vmulps %ymm12, %ymm10, %ymm15
120 vmulps %ymm11, %ymm12, %ymm0
122 /* Now multiplex to the case X = 2^-30 * input, Xl = 0 in the "big" case. */
123 vmulps XScale+__svml_sacosh_data_internal(%rip), %ymm3, %ymm11
126 * Compute e = -(2 * d + d^2)
127 * The first FMR is exact, and the rounding error in the other is acceptable
128 * since d and e are ~ 2^-8
130 vmovaps %ymm2, %ymm13
131 vfnmadd231ps %ymm15, %ymm12, %ymm13
132 vfnmadd231ps %ymm0, %ymm12, %ymm13
133 vfmadd213ps sC2+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
134 vfmadd213ps sHalf+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
135 vmulps %ymm14, %ymm13, %ymm7
136 vorps %ymm5, %ymm4, %ymm6
139 * For low-accuracy versions, the computation can be done
140 * just as U + ((S + T) + (S + T) * Corr)
142 vaddps %ymm0, %ymm15, %ymm5
144 /* sU is needed later on */
145 vsubps %ymm2, %ymm3, %ymm4
146 vfmadd213ps %ymm5, %ymm7, %ymm5
147 vmovmskps %ymm6, %edx
148 vaddps %ymm5, %ymm4, %ymm6
151 * Now resume the main code.
152 * reduction: compute r, n
154 vmovups iBrkValue+__svml_sacosh_data_internal(%rip), %ymm4
157 * Now we feed into the log1p code, using H in place of _VARG1 and
158 * also adding L into Xl.
159 * compute 1+x as high, low parts
161 vmaxps %ymm6, %ymm2, %ymm8
162 vminps %ymm6, %ymm2, %ymm9
163 vaddps %ymm9, %ymm8, %ymm12
164 vblendvps %ymm1, %ymm12, %ymm11, %ymm14
165 vsubps %ymm12, %ymm8, %ymm10
166 vpsubd %ymm4, %ymm14, %ymm15
167 vaddps %ymm10, %ymm9, %ymm13
168 vpand iOffExpoMask+__svml_sacosh_data_internal(%rip), %ymm15, %ymm14
169 vpsrad $23, %ymm15, %ymm15
170 vpaddd %ymm4, %ymm14, %ymm8
171 vpslld $23, %ymm15, %ymm5
172 vmovups sPoly+224+__svml_sacosh_data_internal(%rip), %ymm4
173 vcvtdq2ps %ymm15, %ymm0
174 vpsubd %ymm5, %ymm2, %ymm7
176 /* polynomial evaluation */
177 vsubps %ymm2, %ymm8, %ymm2
179 /* Add 31 to the exponent in the "large" case to get log(2 * input) */
180 vaddps sThirtyOne+__svml_sacosh_data_internal(%rip), %ymm0, %ymm5
181 vandps %ymm1, %ymm13, %ymm6
182 vmulps %ymm7, %ymm6, %ymm9
183 vblendvps %ymm1, %ymm0, %ymm5, %ymm0
184 vaddps %ymm2, %ymm9, %ymm2
185 vfmadd213ps sPoly+192+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
186 vfmadd213ps sPoly+160+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
187 vfmadd213ps sPoly+128+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
188 vfmadd213ps sPoly+96+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
189 vfmadd213ps sPoly+64+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
190 vfmadd213ps sPoly+32+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
191 vfmadd213ps sPoly+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
192 vmulps %ymm4, %ymm2, %ymm6
193 vfmadd213ps %ymm2, %ymm2, %ymm6
195 /* final reconstruction */
196 vfmadd132ps sLn2+__svml_sacosh_data_internal(%rip), %ymm6, %ymm0
199 /* Go to special inputs processing branch */
200 jne L(SPECIAL_VALUES_BRANCH)
201 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
204 * and exit the function
220 L(SPECIAL_VALUES_BRANCH):
221 vmovups %ymm3, 32(%rsp)
222 vmovups %ymm0, 64(%rsp)
223 # LOE rbx r12 r13 r14 r15 edx ymm0
226 # LOE rbx r12 r13 r14 r15 eax edx
230 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
231 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
234 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
235 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
238 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
239 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
240 # LOE rbx r15 r12d r13d
249 /* Call scalar math function */
250 jc L(SCALAR_MATH_CALL)
251 # LOE rbx r15 r12d r13d
257 L(SPECIAL_VALUES_LOOP):
261 /* Check bits in range mask */
262 jl L(RANGEMASK_CHECK)
263 # LOE rbx r15 r12d r13d
271 vmovups 64(%rsp), %ymm0
275 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
276 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
277 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
278 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
279 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
280 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
281 # LOE rbx r12 r13 r14 r15 ymm0
283 /* Scalar math fucntion call
284 * to process special input
289 vmovss 32(%rsp, %r14, 4), %xmm0
291 # LOE rbx r14 r15 r12d r13d xmm0
293 vmovss %xmm0, 64(%rsp, %r14, 4)
295 /* Process special inputs in loop */
296 jmp L(SPECIAL_VALUES_LOOP)
297 # LOE rbx r15 r12d r13d
298 END(_ZGVdN8v_acoshf_avx2)
300 .section .rodata, "a"
303 #ifdef __svml_sacosh_data_internal_typedef
304 typedef unsigned int VUINT32;
306 __declspec(align(32)) VUINT32 sOne[8][1];
307 __declspec(align(32)) VUINT32 sPoly[8][8][1];
308 __declspec(align(32)) VUINT32 iBrkValue[8][1];
309 __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
310 __declspec(align(32)) VUINT32 sBigThreshold[8][1];
311 __declspec(align(32)) VUINT32 sC2[8][1];
312 __declspec(align(32)) VUINT32 sC3[8][1];
313 __declspec(align(32)) VUINT32 sHalf[8][1];
314 __declspec(align(32)) VUINT32 sLargestFinite[8][1];
315 __declspec(align(32)) VUINT32 sThirtyOne[8][1];
316 __declspec(align(32)) VUINT32 sTopMask8[8][1];
317 __declspec(align(32)) VUINT32 XScale[8][1];
318 __declspec(align(32)) VUINT32 sLn2[8][1];
319 } __svml_sacosh_data_internal;
321 __svml_sacosh_data_internal:
323 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
324 /* sPoly[] = SP polynomial */
326 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
327 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
328 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
329 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
330 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
331 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
332 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
333 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
334 /* iBrkValue = SP 2/3 */
336 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
337 /* iOffExpoMask = SP significand mask */
339 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
342 .long 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
345 .long 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
348 .long 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
351 .long 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
354 .long 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
357 .long 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
360 .long 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
363 .long 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
364 /* sLn2 = SP ln(2) */
366 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
368 .type __svml_sacosh_data_internal, @object
369 .size __svml_sacosh_data_internal, .-__svml_sacosh_data_internal