1 /* Function exp vectorized with SSE4.
2 Copyright (C) 2014-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_d_exp_data.h"
22 .section .text.sse4, "ax", @progbits
/* NOTE(review): this excerpt is lossy — the baked-in line numbers (23, 25, 27, ...)
   jump, and several instructions whose results are consumed below are not visible
   here (the pand/pcmpgtd producing the range mask, the mul/sub reduction steps,
   the pushes described by the cfi_offset_rel_rsp notes, and the scalar-exp call
   loop indexed by %r15).  Comments below describe only the visible lines; confirm
   anything further against the upstream glibc svml_d_exp2_core_sse4.S.  */
23 ENTRY (_ZGVbN2v_exp_sse4)
/*
25 ALGORITHM DESCRIPTION:
27 Argument representation:
28 N = rint(X*2^k/ln2) = 2^k*M+j
29 X = N*ln2/2^k + r = M*ln2 + ln2*(j/2^k) + r
30 then -ln2/2^(k+1) < r < ln2/2^(k+1)
36 exp(X) = exp(M*ln2 + ln2*(j/2^k) + r)
37 = 2^M * 2^(j/2^k) * exp(r)
38 2^M is calculated by bit manipulation
39 2^(j/2^k) is stored in table
40 exp(r) is approximated by polynomial.
42 The table lookup is skipped if k = 0. */
/* Frame setup CFI; the matching push %rbp / mov %rsp, %rbp instructions are
   not visible in this excerpt.  */
45 cfi_adjust_cfa_offset (8)
46 cfi_rel_offset (%rbp, 0)
48 cfi_def_cfa_register (%rbp)
/* %r8 = base address of the exp constant/table data, loaded PIC-style
   through the GOT.  All __db*/__i*/__l* operands below are offsets into it
   (defined in svml_d_exp_data.h).  */
52 movq __svml_dexp_data@GOTPCREL(%rip), %r8
54 /* iAbsX = (int)(lX>>32), lX = *(longlong*)&X */
/* Shuffle $221 (0b11_01_11_01) gathers the odd 32-bit dwords, i.e. the high
   halves of both doubles; %xmm3 presumably holds the input X vector — its
   load is not visible here (TODO confirm).  */
55 pshufd $221, %xmm3, %xmm7
/* 1/ln2 scaled by 2^k, used to compute N = rint(X*2^k/ln2).  */
56 movups __dbInvLn2(%r8), %xmm0
/* Masks for the special-case test: strip the sign bits, then compare the
   exponent fields against the finite domain limit.  The pand/pcmpgtd that
   use these are not visible in this excerpt.  */
60 movq __iAbsMask(%r8), %xmm5
61 movq __iDomainRange(%r8), %xmm6
63 /* iAbsX = iAbsX&iAbsMask */
66 /* iRangeMask = (iAbsX>iDomainRange) */
69 /* Mask = iRangeMask?1:0, set mask for overflow/underflow */
72 /* dN = rint(X*2^k/Ln2) */
/* Two-part (hi/lo) representation of ln2/2^k for a Cody-Waite style
   argument reduction with no rounding error in the hi product.  */
74 movups __dbLn2hi(%r8), %xmm5
75 movups __dbLn2lo(%r8), %xmm6
/* roundpd imm 0 = round to nearest-even: dN = rint(X * 2^k/ln2).  */
76 roundpd $0, %xmm0, %xmm7
78 /* dR = X - dN*dbLn2hi, dbLn2hi is 52-8-k hi bits of ln2/2^k */
81 /* dR = dR - dN*dbLn2lo, dbLn2lo is 40..94 bits of lo part of ln2/2^k */
/* Shifter constant: adding it to X/ln2 lands the integer part in the low
   mantissa bits, giving rint + reinterpretable integer in one step.  */
83 movups __dbShifter(%r8), %xmm4
85 /* dM = X*dbInvLn2+dbShifter */
/* Degree-2 polynomial for exp(r) on the reduced interval, evaluated
   Horner-style; the mulpd/addpd steps between these loads are not visible.  */
90 movups __dPC2(%r8), %xmm5
92 /* exp(r) = b0+r*(b0+r*(b1+r*b2)) */
94 addpd __dPC1(%r8), %xmm5
96 movups __dPC0(%r8), %xmm6
/* Low-K-bits mask splitting lM into table index j and exponent part M.  */
99 movdqu __lIndexMask(%r8), %xmm2
101 /* lIndex = (*(longlong*)&dM)&lIndexMask, lIndex is the lower K bits of lM */
104 /* lM = (*(longlong*)&dM)&(~lIndexMask) */
108 /* lM = lM<<(52-K), 2^M */
111 /* table lookup for dT[j] = 2^(j/2^k) */
/* Word 4 of %xmm1 = low 16 bits of the upper lane's scaled index; %ecx and
   %rdx (set by a non-visible extract) address the two table entries.  */
113 pextrw $4, %xmm1, %ecx
/* Gather the two 64-bit table entries 2^(j/2^k) into low/high halves.  */
117 movq (%r8,%rdx), %xmm0
119 movhpd (%r8,%rcx), %xmm0
121 /* 2^(j/2^k) * exp(r) */
124 /* multiply by 2^M through integer add */
/* Tear down the frame (mov %rbp,%rsp / pop %rbp not visible) and, on the
   fast path, return; below is the slow path for over/underflow lanes.  */
131 cfi_def_cfa_register (%rsp)
133 cfi_adjust_cfa_offset (-8)
/* Special-case path: spill input (192) and fast-path result (256) so the
   scalar fixup loop below can patch individual lanes.  */
139 movups %xmm3, 192(%rsp)
140 movups %xmm0, 256(%rsp)
/* Save all call-clobbered xmm8-xmm15 before calling the scalar exp;
   NOTE(review): saving caller-saved xmm regs here suggests the vector ABI
   requires the wrapper to preserve them across the scalar call — confirm.  */
145 movups %xmm8, 112(%rsp)
146 movups %xmm9, 96(%rsp)
147 movups %xmm10, 80(%rsp)
148 movups %xmm11, 64(%rsp)
149 movups %xmm12, 48(%rsp)
150 movups %xmm13, 32(%rsp)
151 movups %xmm14, 16(%rsp)
152 movups %xmm15, (%rsp)
/* CFI for r12-r15 saves; the actual mov/push instructions are not visible
   in this excerpt.  */
156 cfi_offset_rel_rsp (12, 168)
159 cfi_offset_rel_rsp (13, 160)
162 cfi_offset_rel_rsp (14, 152)
165 cfi_offset_rel_rsp (15, 144)
/* Restore xmm8-xmm15 after the scalar fixup loop has run.  */
183 movups 112(%rsp), %xmm8
184 movups 96(%rsp), %xmm9
185 movups 80(%rsp), %xmm10
186 movups 64(%rsp), %xmm11
187 movups 48(%rsp), %xmm12
188 movups 32(%rsp), %xmm13
189 movups 16(%rsp), %xmm14
190 movups (%rsp), %xmm15
/* Reload the (now lane-patched) result vector as the return value.  */
201 movups 256(%rsp), %xmm0
/* Per-lane scalar fixup: %r15 indexes the faulting lane; load the saved
   input element, call the scalar exp (call not visible here), store the
   result back over the corresponding output slot.  Offsets 200/264 are the
   upper lane, 192/256 the lower.  */
208 movsd 200(%rsp,%r15), %xmm0
212 movsd %xmm0, 264(%rsp,%r15)
218 movsd 192(%rsp,%r15), %xmm0
222 movsd %xmm0, 256(%rsp,%r15)
225 END (_ZGVbN2v_exp_sse4)