1 /* Function log1p vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * 1+x = 2^k*(xh + xl) is computed in high-low parts; xh in [1, 2)
23 * Get short reciprocal approximation Rcp ~ 1/xh
24 * R = (Rcp*xh - 1.0) + Rcp*xl
25 * log1p(x) = k*log(2.0) - log(Rcp) + poly(R)
26 * log(Rcp) is tabulated
/* Offsets for data table __svml_dlog1p_data_internal_avx512.
   Each named entry is the byte offset of one 512-bit (64-byte) vector
   of 8 identical doubles inside the table, so consecutive offsets
   differ by 64.  NOTE(review): the companion offsets for Log_tbl, One,
   SgnMask, C075 and L2 are defined in a part of the file not visible
   in this excerpt; from the table layout below they would be 0, 128,
   192, 256 and 832 respectively.  */
#define poly_coeff9 320
#define poly_coeff8 384
#define poly_coeff7 448
#define poly_coeff6 512
#define poly_coeff5 576
#define poly_coeff4 640
#define poly_coeff3 704
#define poly_coeff2 768
	.section .text.evex512, "ax", @progbits

/* _ZGVeN8v_log1p_skx — 8-lane (zmm) AVX-512 vector variant of
   double log1p(double x).
   Algorithm (see file header):  1+x = 2^k * (xh + xl), xh in [1,2);
   Rcp ~ 1/xh (vrcp14pd, rounded to a few fractional bits so that
   -log(Rcp) can be tabulated); reduced argument
   R = (Rcp*xh - 1) + Rcp*xl; result = k*log(2) - log(Rcp) + poly(R).
   NOTE(review): the argument vector is in %zmm9 at this point — the
   move from the ABI argument register and the stack-frame setup are in
   lines elided from this excerpt.  Result is returned in %zmm0.  */
ENTRY(_ZGVeN8v_log1p_skx)
	cfi_def_cfa_offset(16)
	/* zmm7 = 1.0 in all lanes; zmm14 = sign-bit-only mask.  */
	vmovups One+__svml_dlog1p_data_internal_avx512(%rip), %zmm7
	vmovups SgnMask+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
	/* zmm11 = 1+x (rounded sum); zmm8 = sign bit of x, OR-ed back into
	   the final result below (keeps log1p(-0.0) == -0.0).  */
	vaddpd {rn-sae}, %zmm9, %zmm7, %zmm11
	vandpd %zmm14, %zmm9, %zmm8

	/* compute 1+x as high, low parts */
	vmaxpd {sae}, %zmm9, %zmm7, %zmm10	/* zmm10 = max(x, 1.0) */
	vminpd {sae}, %zmm9, %zmm7, %zmm12	/* zmm12 = min(x, 1.0) */

	/* GetMant(x), normalized to [1, 2) for x>=0, NaN for x<0 */
	vgetmantpd $8, {sae}, %zmm11, %zmm6

	/* zmm5 = k, the unbiased exponent of 1+x.  */
	vgetexppd {sae}, %zmm11, %zmm5
	/* zmm13 = (1+x) - max(x,1): part of min(x,1) already absorbed in
	   the rounded sum (2Sum-style error computation).  */
	vsubpd {rn-sae}, %zmm10, %zmm11, %zmm13

	/* DblRcp ~ 1/Mantissa */
	vrcp14pd %zmm6, %zmm15

	/* Start polynomial evaluation */
	vmovups poly_coeff9+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
	vmovups poly_coeff7+__svml_dlog1p_data_internal_avx512(%rip), %zmm11

	/* zmm2 = Xl, the rounding error of the 1+x addition.  */
	vsubpd {rn-sae}, %zmm13, %zmm12, %zmm2
	/* zmm3 = -k (flip the sign bit of the exponent); used below to
	   rescale Xl onto the mantissa's binade via vscalefpd.  */
	vxorpd %zmm14, %zmm5, %zmm3

	/* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
	vrndscalepd $88, {sae}, %zmm15, %zmm4
	vmovups poly_coeff5+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
	vmovups poly_coeff6+__svml_dlog1p_data_internal_avx512(%rip), %zmm14
	vmovups poly_coeff3+__svml_dlog1p_data_internal_avx512(%rip), %zmm13
	/* zmm1 = Xl * 2^-k: the low part scaled to the mantissa's range.  */
	vscalefpd {rn-sae}, %zmm3, %zmm2, %zmm1

	/* Reduced argument: R = DblRcp*(Mantissa+Xl) - 1 */
	vfmsub213pd {rn-sae}, %zmm7, %zmm4, %zmm6	/* R = Rcp*Mant - 1 */
	/* zmm3 = first half (entries 0-7) of the -log(Rcp) table; the
	   second half is merged in by vpermt2pd below.  */
	vmovups __svml_dlog1p_data_internal_avx512(%rip), %zmm3

	/* Prepare exponent correction: DblRcp<0.75?  */
	vmovups C075+__svml_dlog1p_data_internal_avx512(%rip), %zmm2

	/* Prepare table index */
	vpsrlq $48, %zmm4, %zmm0	/* top mantissa bits of rounded Rcp */
	vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm6	/* R += Rcp * Xl' */
	vmovups poly_coeff8+__svml_dlog1p_data_internal_avx512(%rip), %zmm1
	/* k1 = lanes with DblRcp < 0.75 (predicate 17 = LT_OQ).  */
	vcmppd $17, {sae}, %zmm2, %zmm4, %k1
	/* k0 = lanes where R != R, i.e. R is NaN (predicate 4 = NEQ_UQ):
	   marks special inputs (x <= -1, Inf, NaN) for the scalar path.  */
	vcmppd $4, {sae}, %zmm6, %zmm6, %k0
	vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1	/* c8 + c9*R */
	vmovups poly_coeff4+__svml_dlog1p_data_internal_avx512(%rip), %zmm10
	vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14	/* c6 + c7*R */
	vmovups L2+__svml_dlog1p_data_internal_avx512(%rip), %zmm4	/* log(2) */
	/* Table lookup: zmm3 = -log(Rcp) chosen from the 16-entry Log_tbl
	   (Log_tbl and Log_tbl+64) by per-lane index zmm0.  */
	vpermt2pd Log_tbl+64+__svml_dlog1p_data_internal_avx512(%rip), %zmm0, %zmm3

	/* add 1 to Expon if DblRcp<0.75 */
	vaddpd {rn-sae}, %zmm7, %zmm5, %zmm5{%k1}

	/* Polynomial poly(R) = R + R^2*(c2 + c3*R + ... + c9*R^7),
	   evaluated Estrin-style with R2 = R*R and R4 = R2*R2.  */
	vmulpd {rn-sae}, %zmm6, %zmm6, %zmm0		/* R2 */
	vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm10	/* c4 + c5*R */
	vmovups poly_coeff2+__svml_dlog1p_data_internal_avx512(%rip), %zmm12
	vmulpd {rn-sae}, %zmm0, %zmm0, %zmm15		/* R4 */
	vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12	/* c2 + c3*R */
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1	/* (c8+c9R)*R2 + (c6+c7R) */
	vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm10	/* (c4+c5R)*R2 + (c2+c3R) */
	vfmadd213pd {rn-sae}, %zmm10, %zmm15, %zmm1	/* hi*R4 + lo */
	vfmadd213pd {rn-sae}, %zmm6, %zmm0, %zmm1	/* poly = acc*R2 + R */
	/* Combine: result = k*log(2) + (-log(Rcp) + poly), then OR in the
	   saved sign of x.  */
	vaddpd {rn-sae}, %zmm1, %zmm3, %zmm6
	vfmadd213pd {rn-sae}, %zmm6, %zmm4, %zmm5
	vorpd %zmm8, %zmm5, %zmm0

	/* Go to special inputs processing branch.
	   NOTE(review): the instruction materializing/testing the special-
	   lane mask from %k0 (setting ZF and edx) is elided from this
	   excerpt.  */
	jne L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm9

	/* (Fast path: restore the stack frame and exit the function —
	   elided here.)  */

	/* Special-lane fixup: spill the argument (zmm9) and the fast-path
	   result (zmm0), then recompute the flagged lanes one at a time
	   with the scalar log1p.  */
L(SPECIAL_VALUES_BRANCH):
	vmovups %zmm9, 64(%rsp)
	vmovups %zmm0, 128(%rsp)
	# LOE rbx r12 r13 r14 r15 edx zmm0

	# LOE rbx r12 r13 r14 r15 eax edx

	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22

	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r15 r12d r13d

	/* Call scalar math function.
	   NOTE(review): the bit test that sets CF for this branch, and the
	   L(SCALAR_MATH_CALL) / L(RANGEMASK_CHECK) label bodies, are in
	   elided lines.  */
	jc L(SCALAR_MATH_CALL)
	# LOE rbx r15 r12d r13d

L(SPECIAL_VALUES_LOOP):

	/* Check bits in range mask */
	jl L(RANGEMASK_CHECK)
	# LOE rbx r15 r12d r13d

	/* All special lanes fixed up: reload the patched result vector
	   (the frame teardown and return follow in elided lines).  */
	vmovups 128(%rsp), %zmm0

	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
	# LOE rbx r12 r13 r14 r15 zmm0

	/* Scalar math function call
	 * to process special input: load lane %r14 of the saved argument
	 * into xmm0 for the scalar routine.  */
	vmovsd 64(%rsp, %r14, 8), %xmm0
	# LOE rbx r14 r15 r12d r13d xmm0

	/* Store the scalar result back into lane %r14 of the output.  */
	vmovsd %xmm0, 128(%rsp, %r14, 8)

	/* Process special inputs in loop */
	jmp L(SPECIAL_VALUES_LOOP)
	# LOE rbx r15 r12d r13d
END(_ZGVeN8v_log1p_skx)
	.section .rodata, "a"

/* Layout documentation for the data table, written as a C struct; the
   guard macro is never defined, so none of this is assembled.
   NOTE(review): the `typedef struct {` opener and the matching #endif
   are in lines elided from this excerpt.  */
#ifdef __svml_dlog1p_data_internal_avx512_typedef
typedef unsigned int VUINT32;
__declspec(align(64)) VUINT32 Log_tbl[16][2];
__declspec(align(64)) VUINT32 One[8][2];
__declspec(align(64)) VUINT32 SgnMask[8][2];
__declspec(align(64)) VUINT32 C075[8][2];
__declspec(align(64)) VUINT32 poly_coeff9[8][2];
__declspec(align(64)) VUINT32 poly_coeff8[8][2];
__declspec(align(64)) VUINT32 poly_coeff7[8][2];
__declspec(align(64)) VUINT32 poly_coeff6[8][2];
__declspec(align(64)) VUINT32 poly_coeff5[8][2];
__declspec(align(64)) VUINT32 poly_coeff4[8][2];
__declspec(align(64)) VUINT32 poly_coeff3[8][2];
__declspec(align(64)) VUINT32 poly_coeff2[8][2];
__declspec(align(64)) VUINT32 L2[8][2];
} __svml_dlog1p_data_internal_avx512;

/* Tables are laid out contiguously in 64-byte vectors; the byte
   offsets match the #defines above (poly_coeff9 at 320, ..., L2 at
   832).  */
__svml_dlog1p_data_internal_avx512:
	/* Log_tbl (offset 0): 16 doubles of -log(Rcp) for the 16 possible
	   4-bit-rounded reciprocals; consumed via vpermt2pd.  Entries 0-7
	   are -log(1 + j/16); entries 8-15 equal log(2) - log(1 + j/16).  */
	.quad 0x0000000000000000
	.quad 0xbfaf0a30c01162a6
	.quad 0xbfbe27076e2af2e6
	.quad 0xbfc5ff3070a793d4
	.quad 0xbfcc8ff7c79a9a22
	.quad 0xbfd1675cababa60e
	.quad 0xbfd4618bc21c5ec2
	.quad 0xbfd739d7f6bbd007
	.quad 0x3fd269621134db92
	.quad 0x3fcf991c6cb3b379
	.quad 0x3fca93ed3c8ad9e3
	.quad 0x3fc5bf406b543db2
	.quad 0x3fc1178e8227e47c
	.quad 0x3fb9335e5d594989
	.quad 0x3fb08598b59e3a07
	.quad 0x3fa0415d89e74444

	/* One (offset 128): 1.0 broadcast across 8 lanes.  */
	.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000

	/* SgnMask (offset 192): sign-bit-only mask.  */
	.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000

	/* C075 (offset 256): 0.75, the DblRcp threshold for the +1
	   exponent correction.  */
	.quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000

	/* Polynomial coefficients for log(1+R) = R + c2*R^2 + ... + c9*R^9;
	   each c_k is close to the Taylor value (-1)^k/k, minimax-tuned.  */
	/* poly_coeff9 (offset 320): ~ +1/9.  */
	.quad 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70, 0x3fbC81CD309D7C70

	/* poly_coeff8 (offset 384): ~ -1/8.  */
	.quad 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62, 0xbfc007357E93AF62

	/* poly_coeff7 (offset 448): ~ +1/7.  */
	.quad 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF, 0x3fc249229CEE81EF

	/* poly_coeff6 (offset 512): ~ -1/6.  */
	.quad 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06, 0xbfc55553FB28DB06

	/* poly_coeff5 (offset 576): ~ +1/5.  */
	.quad 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C, 0x3fc9999999CC9F5C

	/* poly_coeff4 (offset 640): ~ -1/4.  */
	.quad 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD, 0xbfd00000000C05BD

	/* poly_coeff3 (offset 704): ~ +1/3.  */
	.quad 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466, 0x3fd5555555555466

	/* poly_coeff2 (offset 768): ~ -1/2.  */
	.quad 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6, 0xbfdFFFFFFFFFFFC6

	/* L2 (offset 832): log(2) = 0x1.62e42fefa39efp-1.  */
	.quad 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF, 0x3fe62E42FEFA39EF
	.type __svml_dlog1p_data_internal_avx512, @object
	.size __svml_dlog1p_data_internal_avx512, .-__svml_dlog1p_data_internal_avx512