1 /* Function expm1 vectorized with AVX-512.
2 Copyright (C) 2021-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
21 * After computing exp(x) in high-low parts, an accurate computation is performed to obtain exp(x)-1
22 * Typical exp() implementation, except that:
23 * - tables are small (16 elements), allowing for fast gathers
24 * - all arguments processed in the main path
25 * - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
26 * - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
27 * RZ mode used to avoid overflow to +/-Inf for x*log2(e); helps with special case handling
32 /* Offsets for data table __svml_dexpm1_data_internal_avx512
   (byte offsets from the table base label; the earlier offsets —
   Exp_tbl_H/Exp_tbl_L/L2E/Shifter/Threshold/SgnMask/L2H/L2L/ZThres/EMask —
   are elided from this view).  */
44 #define poly_coeff7 768
45 #define poly_coeff6 832
46 #define poly_coeff5 896
47 #define poly_coeff4 960
48 #define poly_coeff3 1024
49 #define poly_coeff2 1088
54 .section .text.evex512, "ax", @progbits
55 ENTRY(_ZGVeN8v_expm1_skx)
/* NOTE(review): this listing is elided.  The frame setup between ENTRY
   and the first vmovups, and the instruction that places the input
   vector in %zmm2, are not visible here.  Annotations below are
   derived only from the visible instructions.  */
57 cfi_def_cfa_offset(16)

/* Preload the vector constants from the data table (RIP-relative).  */
63 vmovups L2E+__svml_dexpm1_data_internal_avx512(%rip), %zmm6
64 vmovups Shifter+__svml_dexpm1_data_internal_avx512(%rip), %zmm4
65 vmovups L2H+__svml_dexpm1_data_internal_avx512(%rip), %zmm11
66 vmovups L2L+__svml_dexpm1_data_internal_avx512(%rip), %zmm5
67 vmovups Threshold+__svml_dexpm1_data_internal_avx512(%rip), %zmm3
68 vmovups poly_coeff5+__svml_dexpm1_data_internal_avx512(%rip), %zmm13
69 vmovups poly_coeff4+__svml_dexpm1_data_internal_avx512(%rip), %zmm15
72 vmovups poly_coeff7+__svml_dexpm1_data_internal_avx512(%rip), %zmm12

74 /* set Z0=max(Z0, -128.0) */
75 vmovups ZThres+__svml_dexpm1_data_internal_avx512(%rip), %zmm8
76 vmovups poly_coeff3+__svml_dexpm1_data_internal_avx512(%rip), %zmm14
/* Table base == Exp_tbl_H (first field of the data layout).  */
77 vmovups __svml_dexpm1_data_internal_avx512(%rip), %zmm9

80 /* 2^(52-4)*1.5 + x * log2(e) */
/* zmm6 = Shifter + x*L2E; the shifter trick leaves x*log2(e) rounded
   to 4 fractional bits in the low mantissa bits of zmm6.  */
81 vfmadd213pd {rn-sae}, %zmm4, %zmm2, %zmm6
82 vmovups Exp_tbl_L+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
/* Predicate 21 = NLT_UQ: set k0 for lanes with !(x < Threshold),
   i.e. large x or NaN, routed to the special-values branch.  */
83 vcmppd $21, {sae}, %zmm3, %zmm2, %k0

85 /* Z0 ~ x*log2(e), rounded to 4 fractional bits */
86 vsubpd {rn-sae}, %zmm4, %zmm6, %zmm7
/* Gather Th = Exp_tbl_H[j] and Tlr = Exp_tbl_L[j]; the low bits of
   zmm6 index the two 64-byte halves of each 16-entry table.  */
87 vpermt2pd Exp_tbl_H+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm9
88 vpermt2pd Exp_tbl_L+64+__svml_dexpm1_data_internal_avx512(%rip), %zmm6, %zmm0
/* zmm1 = sign bit of x, re-merged into the result at the end.  */
89 vandpd SgnMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm2, %zmm1

91 /* R = x - Z0*log(2) */
/* Two-step reduction with the split constant: R = x - Z0*L2H - Z0*L2L.  */
92 vfnmadd213pd {rn-sae}, %zmm2, %zmm7, %zmm11
/* Clamp Z0 at -128 (ZThres) so vscalefpd below stays well-behaved for
   huge negative inputs.  */
93 vmaxpd {sae}, %zmm8, %zmm7, %zmm10
94 vfnmadd231pd {rn-sae}, %zmm7, %zmm5, %zmm11

97 /* ensure |R|<2 even for special cases */
98 vandpd EMask+__svml_dexpm1_data_internal_avx512(%rip), %zmm11, %zmm3
99 vmovups poly_coeff6+__svml_dexpm1_data_internal_avx512(%rip), %zmm11

/* zmm4 = Th * 2^Z0 (vscalef supplies the integer part of Z0; Th is
   presumably the 2^(j/16) fractional factor — consistent with the
   4-bit rounding above).  */
102 vscalefpd {rn-sae}, %zmm10, %zmm9, %zmm4
/* Polynomial, paired then combined via R2 = R*R:
   zmm15 = c4 + R*c5, zmm11 = c6 + R*c7, zmm12 = c2 + R*c3.  */
103 vfmadd231pd {rn-sae}, %zmm3, %zmm13, %zmm15
104 vfmadd231pd {rn-sae}, %zmm3, %zmm12, %zmm11
105 vmovups poly_coeff2+__svml_dexpm1_data_internal_avx512(%rip), %zmm12
106 vmulpd {rn-sae}, %zmm3, %zmm3, %zmm13
107 vfmadd231pd {rn-sae}, %zmm3, %zmm14, %zmm12
/* zmm11 = (c6 + R*c7)*R2 + (c4 + R*c5), then *R2 + (c2 + R*c3).  */
108 vfmadd213pd {rn-sae}, %zmm15, %zmm13, %zmm11
109 vfmadd213pd {rn-sae}, %zmm12, %zmm13, %zmm11

111 /* Tlr + R+ R*Poly */
/* zmm11 = Poly*R2 + Tlr; the "+ R" part is the vaddpd below.  */
112 vfmadd213pd {rn-sae}, %zmm0, %zmm13, %zmm11

115 vmovups One+__svml_dexpm1_data_internal_avx512(%rip), %zmm0
116 vaddpd {rn-sae}, %zmm3, %zmm11, %zmm14
/* zmm15 = Th*2^Z0 - 1.  */
117 vsubpd {rn-sae}, %zmm0, %zmm4, %zmm15

119 /* (Th-1)+Th*(Tlr + R+ R*Poly) */
120 vfmadd213pd {rn-sae}, %zmm15, %zmm14, %zmm4
/* OR in the saved sign of x (e.g. preserves -0 for x == -0).  */
121 vorpd %zmm1, %zmm4, %zmm0

/* NOTE(review): the test of mask k0 that sets the flags consumed by
   this jne is elided from this view.  */
124 /* Go to special inputs processing branch */
125 jne L(SPECIAL_VALUES_BRANCH)
126 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm2
/* (fragment of an elided fast-path comment:)
129 * and exit the function
*/

/* Slow path: spill the input vector (zmm2) and the vector result
   (zmm0) to the stack, then patch each flagged lane via the scalar
   routine.  */
145 L(SPECIAL_VALUES_BRANCH):
146 vmovups %zmm2, 64(%rsp)
147 vmovups %zmm0, 128(%rsp)
148 # LOE rbx r12 r13 r14 r15 edx zmm0
151 # LOE rbx r12 r13 r14 r15 eax edx

155 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
156 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
159 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
160 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
163 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
164 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
165 # LOE rbx r15 r12d r13d

/* NOTE(review): the mask bit-test that sets CF for this jc is elided.  */
174 /* Call scalar math function */
175 jc L(SCALAR_MATH_CALL)
176 # LOE rbx r15 r12d r13d

182 L(SPECIAL_VALUES_LOOP):

186 /* Check bits in range mask */
187 jl L(RANGEMASK_CHECK)
188 # LOE rbx r15 r12d r13d

/* All flagged lanes patched: reload the fixed-up result vector.  */
196 vmovups 128(%rsp), %zmm0

200 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
202 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
203 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
204 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
205 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
206 # LOE rbx r12 r13 r14 r15 zmm0

208 /* Scalar math function call
209 * to process special input
*/
/* Load lane r14 of the spilled input into xmm0; the actual call to
   the scalar expm1 is elided from this view.  */
214 vmovsd 64(%rsp, %r14, 8), %xmm0
216 # LOE rbx r14 r15 r12d r13d xmm0
/* Store the scalar result back into lane r14 of the result area.  */
218 vmovsd %xmm0, 128(%rsp, %r14, 8)

220 /* Process special inputs in loop */
221 jmp L(SPECIAL_VALUES_LOOP)
222 # LOE rbx r15 r12d r13d
223 END(_ZGVeN8v_expm1_skx)
225 .section .rodata, "a"

/* NOTE(review): this data listing is elided — alignment directives and
   per-field label definitions are not visible.  Field identities below
   are inferred from the typedef order and the visible values.  */
228 #ifdef __svml_dexpm1_data_internal_avx512_typedef
229 typedef unsigned int VUINT32;
/* (struct opener elided in this view) */
231 __declspec(align(64)) VUINT32 Exp_tbl_H[16][2];
232 __declspec(align(64)) VUINT32 Exp_tbl_L[16][2];
233 __declspec(align(64)) VUINT32 L2E[8][2];
234 __declspec(align(64)) VUINT32 Shifter[8][2];
235 __declspec(align(64)) VUINT32 Threshold[8][2];
236 __declspec(align(64)) VUINT32 SgnMask[8][2];
237 __declspec(align(64)) VUINT32 L2H[8][2];
238 __declspec(align(64)) VUINT32 L2L[8][2];
239 __declspec(align(64)) VUINT32 ZThres[8][2];
240 __declspec(align(64)) VUINT32 EMask[8][2];
241 __declspec(align(64)) VUINT32 poly_coeff7[8][2];
242 __declspec(align(64)) VUINT32 poly_coeff6[8][2];
243 __declspec(align(64)) VUINT32 poly_coeff5[8][2];
244 __declspec(align(64)) VUINT32 poly_coeff4[8][2];
245 __declspec(align(64)) VUINT32 poly_coeff3[8][2];
246 __declspec(align(64)) VUINT32 poly_coeff2[8][2];
247 __declspec(align(64)) VUINT32 One[8][2];
248 } __svml_dexpm1_data_internal_avx512;

250 __svml_dexpm1_data_internal_avx512:
/* Exp_tbl_H: 16 doubles, high part of the exp lookup table.  First
   entry is exactly 1.0; presumably the entries are 2^(j/16),
   j = 0..15 — consistent with the kernel's 4 fractional rounding
   bits.  TODO confirm against the unelided source.  */
252 .quad 0x3ff0000000000000
253 .quad 0x3ff0b5586cf9890f
254 .quad 0x3ff172b83c7d517b
255 .quad 0x3ff2387a6e756238
256 .quad 0x3ff306fe0a31b715
257 .quad 0x3ff3dea64c123422
258 .quad 0x3ff4bfdad5362a27
259 .quad 0x3ff5ab07dd485429
260 .quad 0x3ff6a09e667f3bcd
261 .quad 0x3ff7a11473eb0187
262 .quad 0x3ff8ace5422aa0db
263 .quad 0x3ff9c49182a3f090
264 .quad 0x3ffae89f995ad3ad
265 .quad 0x3ffc199bdd85529c
266 .quad 0x3ffd5818dcfba487
267 .quad 0x3ffea4afa2a490da
/* Exp_tbl_L: 16 doubles, low-order corrections (Tlr) matching the
   Exp_tbl_H entries; the first is exactly 0.  */
270 .quad 0x0000000000000000
271 .quad 0x3c979aa65d837b6d
272 .quad 0xbc801b15eaa59348
273 .quad 0x3c968efde3a8a894
274 .quad 0x3c834d754db0abb6
275 .quad 0x3c859f48a72a4c6d
276 .quad 0x3c7690cebb7aafb0
277 .quad 0x3c9063e1e21c5409
278 .quad 0xbc93b3efbf5e2228
279 .quad 0xbc7b32dcb94da51d
280 .quad 0x3c8db72fc1f0eab4
281 .quad 0x3c71affc2b91ce27
282 .quad 0x3c8c1a7792cb3387
283 .quad 0x3c736eae30af0cb3
284 .quad 0x3c74a385a63d07a7
285 .quad 0xbc8ff7128fd391f0
/* L2E = log2(e) */
288 .quad 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE, 0x3ff71547652B82FE
289 /* Shifter=2^(52-4)*1.5 */
291 .quad 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0, 0x42f8000000003ff0
/* Threshold: large-|x| cutoff feeding the special-values compare.  */
294 .quad 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44, 0x40861DA04CBAFE44
/* SgnMask: sign bit only.  */
297 .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
298 /* L2H = log(2)_high */
300 .quad 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef, 0x3fe62e42fefa39ef
301 /* L2L = log(2)_low */
303 .quad 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f, 0x3c7abc9e3b39803f
/* ZThres = -128.0: lower clamp applied to Z0 (vmaxpd) before
   vscalefpd.  */
306 .quad 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000, 0xc060000000000000
/* EMask: AND mask clearing exponent bit 62, forcing |R| < 2.  */
309 .quad 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff, 0xbfffffffffffffff
/* poly_coeff7 .. poly_coeff2: coefficients of the exp(R) polynomial,
   Taylor-like (coeff3 ~ 1/6, coeff2 ~ 1/2).  */
/* poly_coeff7 */
312 .quad 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a, 0x3f2a020410303d8a
/* poly_coeff6 */
315 .quad 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f, 0x3f56c1c38e164a2f
/* poly_coeff5 */
318 .quad 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214, 0x3f81111110865214
/* poly_coeff4 */
321 .quad 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06, 0x3fa5555554ad3d06
/* poly_coeff3 (~1/6) */
324 .quad 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656, 0x3fc5555555555656
/* poly_coeff2 (~1/2) */
327 .quad 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2, 0x3fe00000000000a2
/* One = 1.0 */
330 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
332 .type __svml_dexpm1_data_internal_avx512, @object
333 .size __svml_dexpm1_data_internal_avx512, .-__svml_dexpm1_data_internal_avx512