1 /* Function log vectorized with SSE4.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include "svml_d_log_data.h"
23 ENTRY (_ZGVbN2v_log_sse4)
25 ALGORITHM DESCRIPTION:
27 log(x) = -log(Rcp) + log(Rcp*x),
28 where Rcp ~ 1/x (accuracy ~9 bits, obtained by rounding
29 HW approximation to 1+9 mantissa bits)
31 Reduced argument R=Rcp*x-1 is used to approximate log(1+R) as polynomial
33 log(Rcp) = exponent_Rcp*log(2) + log(mantissa_Rcp)
34 -log(mantissa_Rcp) is obtained from a lookup table,
35 accessed by a 9-bit index
38 cfi_adjust_cfa_offset (8)
39 cfi_rel_offset (%rbp, 0)
41 cfi_def_cfa_register (%rbp)
45 movq __svml_dlog_data@GOTPCREL(%rip), %r8
49 /* isolate exponent bits */
52 movups _ExpMask(%r8), %xmm5
54 /* preserve mantissa, set input exponent to 2^(-10) */
56 orps _Two10(%r8), %xmm5
58 /* reciprocal approximation good to at least 11 bits */
60 cmpltpd _MinNorm(%r8), %xmm3
61 cmpnlepd _MaxNorm(%r8), %xmm2
64 /* combine and get argument value range mask */
68 movups _HalfMask(%r8), %xmm2
70 /* argument reduction started: R = Mantissa*Rcp - 1 */
75 /* round reciprocal to nearest integer, will have 1+9 mantissa bits */
76 roundpd $0, %xmm4, %xmm4
79 subpd _One(%r8), %xmm2
81 movups _Threshold(%r8), %xmm2
83 /* calculate index for table lookup */
86 pshufd $221, %xmm1, %xmm7
89 /* convert biased exponent to DP format */
92 movups _poly_coeff_1(%r8), %xmm4
94 /* polynomial computation */
96 andps _Bias(%r8), %xmm2
97 orps _Bias1(%r8), %xmm2
100 Table stores -log(0.5*mantissa) for larger mantissas,
101 adjust exponent accordingly
104 addpd _poly_coeff_2(%r8), %xmm4
106 /* exponent*log(2.0) */
107 mulpd _L2(%r8), %xmm0
110 movups _poly_coeff_3(%r8), %xmm7
113 addpd _poly_coeff_4(%r8), %xmm7
117 pextrd $2, %xmm3, %ecx
121 (exponent*log(2)) + (LogRcp + (R+poly))
125 movsd _LogRcp_lookup(%r8,%rdx), %xmm1
126 movhpd _LogRcp_lookup(%r8,%rcx), %xmm1
135 cfi_def_cfa_register (%rsp)
137 cfi_adjust_cfa_offset (-8)
143 movups %xmm6, 192(%rsp)
144 movups %xmm0, 256(%rsp)
149 movups %xmm8, 112(%rsp)
150 movups %xmm9, 96(%rsp)
151 movups %xmm10, 80(%rsp)
152 movups %xmm11, 64(%rsp)
153 movups %xmm12, 48(%rsp)
154 movups %xmm13, 32(%rsp)
155 movups %xmm14, 16(%rsp)
156 movups %xmm15, (%rsp)
160 cfi_offset_rel_rsp (12, 168)
163 cfi_offset_rel_rsp (13, 160)
166 cfi_offset_rel_rsp (14, 152)
169 cfi_offset_rel_rsp (15, 144)
187 movups 112(%rsp), %xmm8
188 movups 96(%rsp), %xmm9
189 movups 80(%rsp), %xmm10
190 movups 64(%rsp), %xmm11
191 movups 48(%rsp), %xmm12
192 movups 32(%rsp), %xmm13
193 movups 16(%rsp), %xmm14
194 movups (%rsp), %xmm15
205 movups 256(%rsp), %xmm0
212 movsd 200(%rsp,%r15), %xmm0
216 movsd %xmm0, 264(%rsp,%r15)
222 movsd 192(%rsp,%r15), %xmm0
226 movsd %xmm0, 256(%rsp,%r15)
229 END (_ZGVbN2v_log_sse4)