1 /* Function log1pf vectorized with AVX-512.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * 1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
23 * Get short reciprocal approximation Rcp ~ 1/xh
24 * R = (Rcp*xh - 1.0) + Rcp*xl
25 * log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
26 * log(Rcp) is tabulated
31 /* Offsets for data table __svml_slog1p_data_internal
46 #define iOffExpoMask 832
/* NOTE(review): 832 is the byte offset of the iOffExpoMask row (the SP
   significand mask 0x007fffff, see the .rodata table below) within the
   64-byte-aligned data object.  The offsets for the other rows (SgnMask,
   sOne, sPoly_*, iHiDelta, iLoRange, iBrkValue, sLn2) are on lines elided
   from this listing -- confirm against the full file. */
51 .section .text.evex512, "ax", @progbits
52 ENTRY(_ZGVeN16v_log1pf_skx)
/* NOTE(review): this listing is elided (embedded source line numbers jump).
   The prologue that aligns the stack, the instruction that places the
   16-float argument vector x in %zmm3, and the kmov/test that sets the
   flags consumed by the `jne` below are all on lines not visible here --
   confirm against the full file.  `# LOE ...` lines are generator-emitted
   register-liveness annotations, not code. */
54 cfi_def_cfa_offset(16)
/* zmm2 = vector of 1.0f (sOne row). */
60 vmovups sOne+__svml_slog1p_data_internal(%rip), %zmm2
62 /* reduction: compute r, n */
/* zmm12 = iBrkValue (bits of 2/3), zmm4 = SgnMask (0x7fffffff). */
63 vmovups iBrkValue+__svml_slog1p_data_internal(%rip), %zmm12
64 vmovups SgnMask+__svml_slog1p_data_internal(%rip), %zmm4
67 /* compute 1+x as high, low parts */
/* Order the two addends: zmm5 = max(1.0, x), zmm7 = min(1.0, x).
   {sae} suppresses exception flags on the compares. */
68 vmaxps {sae}, %zmm3, %zmm2, %zmm5
69 vminps {sae}, %zmm3, %zmm2, %zmm7
/* zmm1 = x & ~SgnMask = sign bit of x; OR-ed back into the final result
   (so e.g. the sign of a -0.0 input is preserved). */
70 vandnps %zmm3, %zmm4, %zmm1
/* vpternlogd imm 0xff: zmm4 = all-ones; lanes are cleared under %k1 below
   so that zmm4 ends up non-zero only in special (out-of-range) lanes. */
71 vpternlogd $255, %zmm4, %zmm4, %zmm4
/* Two-sum: xh (zmm9) = rounded high part of 1+x. */
72 vaddps {rn-sae}, %zmm7, %zmm5, %zmm9
/* Integer trick: xh bits minus 2/3 bits puts the scale n (relative to
   the 2/3 break value) into the exponent field of zmm10. */
73 vpsubd %zmm12, %zmm9, %zmm10
/* zmm6 = max - xh (exact; first half of the low-part computation). */
74 vsubps {rn-sae}, %zmm9, %zmm5, %zmm6
76 /* check argument value ranges */
/* Bias xh bits by iHiDelta so one unsigned-style compare below catches
   all special encodings (tiny, negative <= -1, Inf/NaN). */
77 vpaddd iHiDelta+__svml_slog1p_data_internal(%rip), %zmm9, %zmm8
/* n = arithmetic shift of the re-based bits (sign-preserving). */
78 vpsrad $23, %zmm10, %zmm13
79 vmovups sPoly_5+__svml_slog1p_data_internal(%rip), %zmm9
/* Predicate 5 = NLT (signed >=): k1 marks the in-range lanes. */
80 vpcmpd $5, iLoRange+__svml_slog1p_data_internal(%rip), %zmm8, %k1
/* n back in exponent-field position (n << 23). */
81 vpslld $23, %zmm13, %zmm14
/* xl (zmm15) = min + (max - xh): low part of 1+x. */
82 vaddps {rn-sae}, %zmm7, %zmm6, %zmm15
/* zmm0 = (float) n, used for the k*ln2 term at the end. */
83 vcvtdq2ps {rn-sae}, %zmm13, %zmm0
/* zmm13 = bits of 1.0 minus (n << 23) = 2^(-n) as a float. */
84 vpsubd %zmm14, %zmm2, %zmm13
85 vmovups sPoly_8+__svml_slog1p_data_internal(%rip), %zmm7
86 vmovups sPoly_1+__svml_slog1p_data_internal(%rip), %zmm14
/* Scale the low part into the reduced range: zmm6 = xl * 2^(-n). */
87 vmulps {rn-sae}, %zmm13, %zmm15, %zmm6
/* Keep only the significand of the re-based xh, then re-attach the 2/3
   exponent: zmm5 = mantissa of xh mapped into [2/3, 4/3). */
88 vpandd iOffExpoMask+__svml_slog1p_data_internal(%rip), %zmm10, %zmm11
89 vpaddd %zmm12, %zmm11, %zmm5
90 vmovups sPoly_4+__svml_slog1p_data_internal(%rip), %zmm10
91 vmovups sPoly_3+__svml_slog1p_data_internal(%rip), %zmm11
92 vmovups sPoly_2+__svml_slog1p_data_internal(%rip), %zmm12
94 /* polynomial evaluation */
/* R (zmm15) = (reduced mantissa - 1.0) + scaled low part. */
95 vsubps {rn-sae}, %zmm2, %zmm5, %zmm2
96 vaddps {rn-sae}, %zmm6, %zmm2, %zmm15
97 vmovups sPoly_7+__svml_slog1p_data_internal(%rip), %zmm2
/* Horner scheme starting from the top coefficients: zmm2 = P7 + R*P8. */
98 vfmadd231ps {rn-sae}, %zmm15, %zmm7, %zmm2
/* Merge-masked andn(x,x) = 0: zero the in-range (k1) lanes of the
   all-ones zmm4; special lanes keep their all-ones value. */
99 vpandnd %zmm8, %zmm8, %zmm4{%k1}
100 vmovups sPoly_6+__svml_slog1p_data_internal(%rip), %zmm8
102 /* combine and get argument value range mask */
/* k0 != 0 iff some lane is special; the kmov/test transferring k0 to the
   flags for the `jne` below is on elided lines -- TODO confirm. */
103 vptestmd %zmm4, %zmm4, %k0
/* Continue Horner down through P6 .. P1 (all accumulate into zmm2). */
104 vfmadd213ps {rn-sae}, %zmm8, %zmm15, %zmm2
106 vfmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm2
107 vfmadd213ps {rn-sae}, %zmm10, %zmm15, %zmm2
108 vfmadd213ps {rn-sae}, %zmm11, %zmm15, %zmm2
109 vfmadd213ps {rn-sae}, %zmm12, %zmm15, %zmm2
110 vfmadd213ps {rn-sae}, %zmm14, %zmm15, %zmm2
/* poly(R) = R + R^2 * (P1 + R*(P2 + ...)): zmm4 = R*zmm2, then FMA. */
111 vmulps {rn-sae}, %zmm15, %zmm2, %zmm4
112 vfmadd213ps {rn-sae}, %zmm15, %zmm15, %zmm4
114 /* final reconstruction */
/* result = n*ln2 + poly(R); then OR in the saved sign bit of x. */
115 vmovups sLn2+__svml_slog1p_data_internal(%rip), %zmm15
116 vfmadd213ps {rn-sae}, %zmm4, %zmm15, %zmm0
117 vorps %zmm1, %zmm0, %zmm0
120 /* Go to special inputs processing branch */
121 jne L(SPECIAL_VALUES_BRANCH)
122 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm3
125 * and exit the function
/* Slow path: spill input (zmm3) and the fast-path result (zmm0) to the
   stack so special lanes can be fixed up one-by-one by the scalar call. */
141 L(SPECIAL_VALUES_BRANCH):
142 vmovups %zmm3, 64(%rsp)
143 vmovups %zmm0, 128(%rsp)
144 # LOE rbx r12 r13 r14 r15 edx zmm0
147 # LOE rbx r12 r13 r14 r15 eax edx
/* CFA expressions locate the callee-saved r12/r13/r14 spill slots in the
   64-byte-aligned frame for the unwinder. */
151 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
152 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
155 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
156 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
159 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
160 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
161 # LOE rbx r15 r12d r13d
170 /* Call scalar math function */
/* NOTE(review): the compare/shift driving this jc and the loop counters
   in r12d/r13d are on elided lines -- confirm against the full file. */
171 jc L(SCALAR_MATH_CALL)
172 # LOE rbx r15 r12d r13d
/* Iterate over the lanes of the special-values mask. */
178 L(SPECIAL_VALUES_LOOP):
182 /* Check bits in range mask */
183 jl L(RANGEMASK_CHECK)
184 # LOE rbx r15 r12d r13d
/* All special lanes patched: reload the merged result vector. */
192 vmovups 128(%rsp), %zmm0
196 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
197 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
198 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
199 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
200 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
202 # LOE rbx r12 r13 r14 r15 zmm0
204 /* Scalar math function call
205 * to process special input
/* Load lane r14 of the saved input; the scalar call (elided here,
   presumably log1pf) computes the correct special-case result ... */
210 vmovss 64(%rsp, %r14, 4), %xmm0
212 # LOE rbx r14 r15 r12d r13d xmm0
/* ... which is stored back into the same lane of the result spill. */
214 vmovss %xmm0, 128(%rsp, %r14, 4)
216 /* Process special inputs in loop */
217 jmp L(SPECIAL_VALUES_LOOP)
218 # LOE rbx r15 r12d r13d
219 END(_ZGVeN16v_log1pf_skx)
221 .section .rodata, "a"
/* The #ifdef block below is layout documentation in C syntax (never
   assembled): each row is 16 x 32-bit lanes, one full zmm register,
   64-byte aligned.  The matching `typedef struct` opener and `#endif`
   are on lines elided from this listing. */
224 #ifdef __svml_slog1p_data_internal_typedef
225 typedef unsigned int VUINT32;
227 __declspec(align(64)) VUINT32 SgnMask[16][1];
228 __declspec(align(64)) VUINT32 sOne[16][1];
229 __declspec(align(64)) VUINT32 sPoly[8][16][1];
230 __declspec(align(64)) VUINT32 iHiDelta[16][1];
231 __declspec(align(64)) VUINT32 iLoRange[16][1];
232 __declspec(align(64)) VUINT32 iBrkValue[16][1];
233 __declspec(align(64)) VUINT32 iOffExpoMask[16][1];
234 __declspec(align(64)) VUINT32 sLn2[16][1];
235 } __svml_slog1p_data_internal;
237 __svml_slog1p_data_internal:
/* SgnMask: 0x7fffffff = all bits except the SP sign bit. */
239 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
/* sOne: 0x3f800000 = 1.0f. */
242 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
243 /* sPoly[] = SP polynomial */
/* Minimax coefficients P0..P7 for log(1+R) on the reduced range; the
   per-row trailing comments give the decimal values. */
245 .long 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
246 .long 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
247 .long 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
248 .long 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
249 .long 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
250 .long 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
251 .long 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
252 .long 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
253 /* iHiDelta = SP 80000000-7f000000 */
/* Bias added to xh bits before the single-compare range check. */
255 .long 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000
256 /* iLoRange = SP 00800000+iHiDelta */
/* Threshold compared (signed GE, vpcmpd predicate 5) against the biased
   xh bits; lanes below it are routed to the scalar slow path. */
258 .long 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000, 0x01800000
259 /* iBrkValue = SP 2/3 */
/* Break value: mantissas are re-based so the reduced argument lies in
   [2/3, 4/3), centering R around 0. */
261 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
262 /* iOffExpoMask = SP significand mask */
264 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
265 /* sLn2 = SP ln(2) */
267 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
269 .type __svml_slog1p_data_internal, @object
270 .size __svml_slog1p_data_internal, .-__svml_slog1p_data_internal