sysdeps/x86_64/fpu/e_expf.S

   1 /* Optimized __ieee754_expf function.
   2    Copyright (C) 2012-2014 Free Software Foundation, Inc.
   3    Contributed by Intel Corporation.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* Short algorithm description:
  23  *
  24  *  Let K = 64 (table size).
  25  *       e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
  26  *  where
  27  *       x = m*log(2)/K + y,    y in [0.0..log(2)/K]
  28  *       m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
  29  *       values of 2^(j/K) are tabulated as T[j].
  30  *
  31  *       P(y) is a minimax polynomial approximation of expf(x)-1
  32  *       on small interval [0.0..log(2)/K].
  33  *
  34  *       P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
  35  *       z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
  36  *
  37  * Special cases:
  38  *  expf(NaN) = NaN
  39  *  expf(+INF) = +INF
  40  *  expf(-INF) = 0
  41  *  expf(x) = 1 for subnormals
  42  *  for finite argument, only expf(0)=1 is exact
  43  *  expf(x) overflows if x>88.7228317260742190
  44  *  expf(x) underflows if x<-103.972076416015620
  45  */
  46
  47         .text
  48 ENTRY(__ieee754_expf)
  49         /* Input: single precision x in %xmm0 */
  50         cvtss2sd        %xmm0, %xmm1    /* Convert x to double precision */
  51         movd    %xmm0, %ecx             /* Copy x */
  52         movsd   L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */
  53         movsd   L(DP_P2)(%rip), %xmm3   /* DP P2 */
  54         movl    %ecx, %eax              /* x */
  55         mulsd   %xmm1, %xmm2            /* DP x*K/log(2) */
  56         andl    $0x7fffffff, %ecx       /* |x| */
  57         lea     L(DP_T)(%rip), %rsi     /* address of table T[j] */
  58         cmpl    $0x42ad496b, %ecx       /* |x|<125*log(2) ? */
  59         movsd   L(DP_P3)(%rip), %xmm4   /* DP P3 */
  60         addsd   L(DP_RS)(%rip), %xmm2   /* DP x*K/log(2)+RS */
  61         jae     L(special_paths)
  62
  63         /* Here if |x|<125*log(2) */
  64         cmpl    $0x31800000, %ecx       /* |x|<2^(-28) ? */
  65         jb      L(small_arg)
  66
  67         /* Main path: here if 2^(-28)<=|x|<125*log(2) */
  68         cvtsd2ss        %xmm2, %xmm2    /* SP x*K/log(2)+RS */
  69         movd    %xmm2, %eax             /* bits of n*K+j with trash */
  70         subss   L(SP_RS)(%rip), %xmm2   /* SP t=round(x*K/log(2)) */
  71         movl    %eax, %edx              /* n*K+j with trash */
  72         cvtss2sd        %xmm2, %xmm2    /* DP t */
  73         andl    $0x3f, %eax             /* bits of j */
  74         mulsd   L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
  75         andl    $0xffffffc0, %edx       /* bits of n */
  76 #ifdef __AVX__
  77         vaddsd  %xmm1, %xmm2, %xmm0     /* DP y=x-t*log(2)/K */
  78         vmulsd  %xmm0, %xmm0, %xmm2     /* DP z=y*y */
  79 #else
  80         addsd   %xmm1, %xmm2            /* DP y=x-t*log(2)/K */
  81         movaps  %xmm2, %xmm0            /* DP y */
  82         mulsd   %xmm2, %xmm2            /* DP z=y*y */
  83 #endif
  84         mulsd   %xmm2, %xmm4            /* DP P3*z */
  85         addl    $0x1fc0, %edx           /* bits of n + SP exponent bias */
  86         mulsd   %xmm2, %xmm3            /* DP P2*z */
  87         shll    $17, %edx               /* SP 2^n */
  88         addsd   L(DP_P1)(%rip), %xmm4   /* DP P3*z+P1 */
  89         addsd   L(DP_P0)(%rip), %xmm3   /* DP P2*z+P0 */
  90         movd    %edx, %xmm1             /* SP 2^n */
  91         mulsd   %xmm2, %xmm4            /* DP (P3*z+P1)*z */
  92         mulsd   %xmm3, %xmm0            /* DP (P2*z+P0)*y */
  93         addsd   %xmm4, %xmm0            /* DP P(y) */
  94         mulsd   (%rsi,%rax,8), %xmm0    /* DP P(y)*T[j] */
  95         addsd   (%rsi,%rax,8), %xmm0    /* DP T[j]*(P(y)+1) */
  96         cvtsd2ss        %xmm0, %xmm0    /* SP T[j]*(P(y)+1) */
  97         mulss   %xmm1, %xmm0            /* SP result=2^n*(T[j]*(P(y)+1)) */
  98         ret
  99
 100         .p2align        4
 101 L(small_arg):
 102         /* Here if 0<=|x|<2^(-28) */
 103         addss   L(SP_ONE)(%rip), %xmm0  /* 1.0 + x */
 104         /* Return 1.0 with inexact raised, except for x==0 */
 105         ret
 106
 107         .p2align        4
 108 L(special_paths):
 109         /* Here if 125*log(2)<=|x| */
 110         shrl    $31, %eax               /* Get sign bit of x, and depending on it: */
 111         lea     L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */
 112         cmpl    (%rdx,%rax,4), %ecx     /* |x|<under/overflow bound ? */
 113         jbe     L(near_under_or_overflow)
 114
 115         /* Here if |x|>under/overflow bound */
 116         cmpl    $0x7f800000, %ecx       /* |x| is finite ? */
 117         jae     L(arg_inf_or_nan)
 118
 119         /* Here if |x|>under/overflow bound, and x is finite */
 120         testq   %rax, %rax              /* sign of x nonzero ? */
 121         je      L(res_overflow)
 122
 123         /* Here if -inf<x<underflow bound (x<0) */
 124         movss   L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
 125         mulss   %xmm0, %xmm0            /* Return underflowed result (zero or subnormal) */
 126         ret
 127
 128         .p2align        4
 129 L(res_overflow):
 130         /* Here if overflow bound<x<inf (x>0) */
 131         movss   L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
 132         mulss   %xmm0, %xmm0            /* Return overflowed result (Inf or max normal) */
 133         ret
 134
 135         .p2align        4
 136 L(arg_inf_or_nan):
 137         /* Here if |x| is Inf or NAN */
 138         jne     L(arg_nan)      /* |x| is Inf ? */
 139
 140         /* Here if |x| is Inf */
 141         lea     L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */
 142         movss   (%rdx,%rax,4), %xmm0    /* return zero or Inf */
 143         ret
 144
 145         .p2align        4
 146 L(arg_nan):
 147         /* Here if |x| is NaN */
 148         addss   %xmm0, %xmm0            /* Return x+x (raise invalid) */
 149         ret
 150
 151         .p2align        4
 152 L(near_under_or_overflow):
 153         /* Here if 125*log(2)<=|x|<under/overflow bound */
 154         cvtsd2ss        %xmm2, %xmm2    /* SP x*K/log(2)+RS */
 155         movd    %xmm2, %eax             /* bits of n*K+j with trash */
 156         subss   L(SP_RS)(%rip), %xmm2   /* SP t=round(x*K/log(2)) */
 157         movl    %eax, %edx              /* n*K+j with trash */
 158         cvtss2sd        %xmm2, %xmm2    /* DP t */
 159         andl    $0x3f, %eax             /* bits of j */
 160         mulsd   L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
 161         andl    $0xffffffc0, %edx       /* bits of n */
 162 #ifdef __AVX__
 163         vaddsd  %xmm1, %xmm2, %xmm0     /* DP y=x-t*log(2)/K */
 164         vmulsd  %xmm0, %xmm0, %xmm2     /* DP z=y*y */
 165 #else
 166         addsd   %xmm1, %xmm2            /* DP y=x-t*log(2)/K */
 167         movaps  %xmm2, %xmm0            /* DP y */
 168         mulsd   %xmm2, %xmm2            /* DP z=y*y */
 169 #endif
 170         mulsd   %xmm2, %xmm4            /* DP P3*z */
 171         addl    $0xffc0, %edx           /* bits of n + DP exponent bias */
 172         mulsd   %xmm2, %xmm3            /* DP P2*z */
 173         shlq    $46, %rdx               /* DP 2^n */
 174         addsd   L(DP_P1)(%rip), %xmm4   /* DP P3*z+P1 */
 175         addsd   L(DP_P0)(%rip), %xmm3   /* DP P2*z+P0 */
 176         movd    %rdx, %xmm1             /* DP 2^n */
 177         mulsd   %xmm2, %xmm4            /* DP (P3*z+P1)*z */
 178         mulsd   %xmm3, %xmm0            /* DP (P2*z+P0)*y */
 179         addsd   %xmm4, %xmm0            /* DP P(y) */
 180         mulsd   (%rsi,%rax,8), %xmm0    /* DP P(y)*T[j] */
 181         addsd   (%rsi,%rax,8), %xmm0    /* DP T[j]*(P(y)+1) */
 182         mulsd   %xmm1, %xmm0            /* DP result=2^n*(T[j]*(P(y)+1)) */
 183         cvtsd2ss        %xmm0, %xmm0    /* convert result to single precision */
 184         ret
 185 END(__ieee754_expf)
 186
 187         .section .rodata, "a"
 188         .p2align 3
 189 L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
 190         .long   0x00000000, 0x3ff00000
 191         .long   0x3e778061, 0x3ff02c9a
 192         .long   0xd3158574, 0x3ff059b0
 193         .long   0x18759bc8, 0x3ff08745
 194         .long   0x6cf9890f, 0x3ff0b558
 195         .long   0x32d3d1a2, 0x3ff0e3ec
 196         .long   0xd0125b51, 0x3ff11301
 197         .long   0xaea92de0, 0x3ff1429a
 198         .long   0x3c7d517b, 0x3ff172b8
 199         .long   0xeb6fcb75, 0x3ff1a35b
 200         .long   0x3168b9aa, 0x3ff1d487
 201         .long   0x88628cd6, 0x3ff2063b
 202         .long   0x6e756238, 0x3ff2387a
 203         .long   0x65e27cdd, 0x3ff26b45
 204         .long   0xf51fdee1, 0x3ff29e9d
 205         .long   0xa6e4030b, 0x3ff2d285
 206         .long   0x0a31b715, 0x3ff306fe
 207         .long   0xb26416ff, 0x3ff33c08
 208         .long   0x373aa9cb, 0x3ff371a7
 209         .long   0x34e59ff7, 0x3ff3a7db
 210         .long   0x4c123422, 0x3ff3dea6
 211         .long   0x21f72e2a, 0x3ff4160a
 212         .long   0x6061892d, 0x3ff44e08
 213         .long   0xb5c13cd0, 0x3ff486a2
 214         .long   0xd5362a27, 0x3ff4bfda
 215         .long   0x769d2ca7, 0x3ff4f9b2
 216         .long   0x569d4f82, 0x3ff5342b
 217         .long   0x36b527da, 0x3ff56f47
 218         .long   0xdd485429, 0x3ff5ab07
 219         .long   0x15ad2148, 0x3ff5e76f
 220         .long   0xb03a5585, 0x3ff6247e
 221         .long   0x82552225, 0x3ff66238
 222         .long   0x667f3bcd, 0x3ff6a09e
 223         .long   0x3c651a2f, 0x3ff6dfb2
 224         .long   0xe8ec5f74, 0x3ff71f75
 225         .long   0x564267c9, 0x3ff75feb
 226         .long   0x73eb0187, 0x3ff7a114
 227         .long   0x36cf4e62, 0x3ff7e2f3
 228         .long   0x994cce13, 0x3ff82589
 229         .long   0x9b4492ed, 0x3ff868d9
 230         .long   0x422aa0db, 0x3ff8ace5
 231         .long   0x99157736, 0x3ff8f1ae
 232         .long   0xb0cdc5e5, 0x3ff93737
 233         .long   0x9fde4e50, 0x3ff97d82
 234         .long   0x82a3f090, 0x3ff9c491
 235         .long   0x7b5de565, 0x3ffa0c66
 236         .long   0xb23e255d, 0x3ffa5503
 237         .long   0x5579fdbf, 0x3ffa9e6b
 238         .long   0x995ad3ad, 0x3ffae89f
 239         .long   0xb84f15fb, 0x3ffb33a2
 240         .long   0xf2fb5e47, 0x3ffb7f76
 241         .long   0x904bc1d2, 0x3ffbcc1e
 242         .long   0xdd85529c, 0x3ffc199b
 243         .long   0x2e57d14b, 0x3ffc67f1
 244         .long   0xdcef9069, 0x3ffcb720
 245         .long   0x4a07897c, 0x3ffd072d
 246         .long   0xdcfba487, 0x3ffd5818
 247         .long   0x03db3285, 0x3ffda9e6
 248         .long   0x337b9b5f, 0x3ffdfc97
 249         .long   0xe78b3ff6, 0x3ffe502e
 250         .long   0xa2a490da, 0x3ffea4af
 251         .long   0xee615a27, 0x3ffefa1b
 252         .long   0x5b6e4540, 0x3fff5076
 253         .long   0x819e90d8, 0x3fffa7c1
 254         .type L(DP_T), @object
 255         ASM_SIZE_DIRECTIVE(L(DP_T))
 256
 257         .section .rodata.cst8,"aM",@progbits,8
 258         .p2align 3
 259 L(DP_KLN2): /* double precision K/log(2) */
 260         .long   0x652b82fe, 0x40571547
 261         .type L(DP_KLN2), @object
 262         ASM_SIZE_DIRECTIVE(L(DP_KLN2))
 263
 264         .p2align 3
 265 L(DP_NLN2K): /* double precision -log(2)/K */
 266         .long   0xfefa39ef, 0xbf862e42
 267         .type L(DP_NLN2K), @object
 268         ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
 269
 270         .p2align 3
 271 L(DP_RS): /* double precision 2^23+2^22 */
 272         .long   0x00000000, 0x41680000
 273         .type L(DP_RS), @object
 274         ASM_SIZE_DIRECTIVE(L(DP_RS))
 275
 276         .p2align 3
 277 L(DP_P3): /* double precision polynomial coefficient P3 */
 278         .long   0xeb78fa85, 0x3fa56420
 279         .type L(DP_P3), @object
 280         ASM_SIZE_DIRECTIVE(L(DP_P3))
 281
 282         .p2align 3
 283 L(DP_P1): /* double precision polynomial coefficient P1 */
 284         .long   0x008d6118, 0x3fe00000
 285         .type L(DP_P1), @object
 286         ASM_SIZE_DIRECTIVE(L(DP_P1))
 287
 288         .p2align 3
 289 L(DP_P2): /* double precision polynomial coefficient P2 */
 290         .long   0xda752d4f, 0x3fc55550
 291         .type L(DP_P2), @object
 292         ASM_SIZE_DIRECTIVE(L(DP_P2))
 293
 294         .p2align 3
 295 L(DP_P0): /* double precision polynomial coefficient P0 */
 296         .long   0xffffe7c6, 0x3fefffff
 297         .type L(DP_P0), @object
 298         ASM_SIZE_DIRECTIVE(L(DP_P0))
 299
 300         .p2align 2
 301 L(SP_RANGE): /* single precision overflow/underflow bounds */
 302         .long   0x42b17217      /* if x>this bound, then result overflows */
 303         .long   0x42cff1b4      /* if x<this bound, then result underflows */
 304         .type L(SP_RANGE), @object
 305         ASM_SIZE_DIRECTIVE(L(SP_RANGE))
 306
 307         .p2align 2
 308 L(SP_INF_0):
 309         .long   0x7f800000      /* single precision Inf */
 310         .long   0               /* single precision zero */
 311         .type L(SP_INF_0), @object
 312         ASM_SIZE_DIRECTIVE(L(SP_INF_0))
 313
 314         .section .rodata.cst4,"aM",@progbits,4
 315         .p2align 2
 316 L(SP_RS): /* single precision 2^23+2^22 */
 317         .long   0x4b400000
 318         .type L(SP_RS), @object
 319         ASM_SIZE_DIRECTIVE(L(SP_RS))
 320
 321         .p2align 2
 322 L(SP_SMALL): /* single precision small value 2^(-100) */
 323         .long   0x0d800000
 324         .type L(SP_SMALL), @object
 325         ASM_SIZE_DIRECTIVE(L(SP_SMALL))
 326
 327         .p2align 2
 328 L(SP_LARGE): /* single precision large value 2^100 */
 329         .long   0x71800000
 330         .type L(SP_LARGE), @object
 331         ASM_SIZE_DIRECTIVE(L(SP_LARGE))
 332
 333         .p2align 2
 334 L(SP_ONE): /* single precision 1.0 */
 335         .long   0x3f800000
 336         .type L(SP_ONE), @object
 337         ASM_SIZE_DIRECTIVE(L(SP_ONE))
 338
 339 strong_alias (__ieee754_expf, __expf_finite)