sysdeps/x86_64/fpu/multiarch/e_expf-fma.S

   1 /* FMA/AVX2 version of IEEE 754 expf.
   2    Copyright (C) 2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Short algorithm description:
  22
  23     Let K = 64 (table size).
  24          e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
  25     where
  26          x = m*log(2)/K + y,    y in [0.0..log(2)/K]
  27          m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
  28          values of 2^(j/K) are tabulated as T[j].
  29
  30          P(y) is a minimax polynomial approximation of expf(x)-1
  31          on small interval [0.0..log(2)/K].
  32
  33          P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
  34          z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
  35
  36    Special cases:
  37     expf(NaN) = NaN
  38     expf(+INF) = +INF
  39     expf(-INF) = 0
  40     expf(x) = 1 for subnormals
  41     for finite argument, only expf(0)=1 is exact
  42     expf(x) overflows if x>88.7228317260742190
  43     expf(x) underflows if x<-103.972076416015620
  44  */
  45
  46         .section .text.fma,"ax",@progbits
  47 ENTRY(__ieee754_expf_fma)
  48         /* Input: single precision x in %xmm0 */
  49         vcvtss2sd %xmm0, %xmm0, %xmm1   /* Convert x to double precision */
  50         vmovd   %xmm0, %ecx             /* Copy x */
  51         vmovsd  L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */
  52         vfmadd213sd L(DP_RD)(%rip), %xmm1, %xmm2 /* DP x*K/log(2)+RD */
  53         vmovsd  L(DP_P2)(%rip), %xmm3   /* DP P2 */
  54         movl    %ecx, %eax              /* x */
  55         andl    $0x7fffffff, %ecx       /* |x| */
  56         lea     L(DP_T)(%rip), %rsi     /* address of table T[j] */
  57         vmovsd  L(DP_P3)(%rip), %xmm4   /* DP P3 */
  58
  59         cmpl    $0x42ad496b, %ecx       /* |x|<125*log(2) ? */
  60         jae     L(special_paths_fma)
  61
  62         /* Here if |x|<125*log(2) */
  63         cmpl    $0x31800000, %ecx       /* |x|<2^(-28) ? */
  64         jb      L(small_arg_fma)
  65
  66         /* Main path: here if 2^(-28)<=|x|<125*log(2) */
  67                                                 /* %xmm2 = SP x*K/log(2)+RS */
  68         vmovd     %xmm2, %eax
  69         vsubsd    L(DP_RD)(%rip), %xmm2, %xmm2  /* DP t=round(x*K/log(2)) */
  70         movl      %eax, %edx                    /* n*K+j with trash */
  71         andl      $0x3f, %eax                   /* bits of j */
  72         vmovsd    (%rsi,%rax,8), %xmm5          /* T[j] */
  73         andl      $0xffffffc0, %edx             /* bits of n */
  74
  75         vfmadd132sd  L(DP_NLN2K)(%rip), %xmm1, %xmm2 /*  DP y=x-t*log(2)/K */
  76         vmulsd      %xmm2, %xmm2, %xmm6         /* DP z=y*y */
  77
  78
  79         vfmadd213sd L(DP_P1)(%rip), %xmm6, %xmm4 /* DP P3*z + P1 */
  80         vfmadd213sd L(DP_P0)(%rip), %xmm6, %xmm3 /* DP P2*z+P0 */
  81
  82         addl        $0x1fc0, %edx               /* bits of n + SP exponent bias */
  83         shll        $17, %edx                   /* SP 2^n */
  84         vmovd       %edx, %xmm1                 /* SP 2^n */
  85
  86         vmulsd      %xmm6, %xmm4, %xmm4         /* DP (P3*z+P1)*z */
  87
  88         vfmadd213sd %xmm4, %xmm3, %xmm2         /* DP P(Y)  (P2*z+P0)*y */
  89         vfmadd213sd %xmm5, %xmm5, %xmm2         /* DP T[j]*(P(y)+1) */
  90         vcvtsd2ss   %xmm2, %xmm2, %xmm0         /* SP T[j]*(P(y)+1) */
  91         vmulss      %xmm1, %xmm0, %xmm0         /* SP result=2^n*(T[j]*(P(y)+1)) */
  92         ret
  93
  94         .p2align        4
  95 L(small_arg_fma):
  96         /* Here if 0<=|x|<2^(-28) */
  97         vaddss  L(SP_ONE)(%rip), %xmm0, %xmm0   /* 1.0 + x */
  98         /* Return 1.0 with inexact raised, except for x==0 */
  99         ret
 100
 101         .p2align        4
 102 L(special_paths_fma):
 103         /* Here if 125*log(2)<=|x| */
 104         shrl    $31, %eax               /* Get sign bit of x, and depending on it: */
 105         lea     L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */
 106         cmpl    (%rdx,%rax,4), %ecx     /* |x|<under/overflow bound ? */
 107         jbe     L(near_under_or_overflow_fma)
 108
 109         /* Here if |x|>under/overflow bound */
 110         cmpl    $0x7f800000, %ecx       /* |x| is finite ? */
 111         jae     L(arg_inf_or_nan_fma)
 112
 113         /* Here if |x|>under/overflow bound, and x is finite */
 114         testl   %eax, %eax              /* sign of x nonzero ? */
 115         je      L(res_overflow_fma)
 116
 117         /* Here if -inf<x<underflow bound (x<0) */
 118         vmovss  L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
 119         vmulss  %xmm0, %xmm0, %xmm0     /* Return underflowed result (zero or subnormal) */
 120         ret
 121
 122         .p2align        4
 123 L(res_overflow_fma):
 124         /* Here if overflow bound<x<inf (x>0) */
 125         vmovss  L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
 126         vmulss  %xmm0, %xmm0, %xmm0     /* Return overflowed result (Inf or max normal) */
 127         ret
 128
 129         .p2align        4
 130 L(arg_inf_or_nan_fma):
 131         /* Here if |x| is Inf or NAN */
 132         jne     L(arg_nan_fma)  /* |x| is Inf ? */
 133
 134         /* Here if |x| is Inf */
 135         lea     L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */
 136         vmovss  (%rdx,%rax,4), %xmm0    /* return zero or Inf */
 137         ret
 138
 139         .p2align        4
 140 L(arg_nan_fma):
 141         /* Here if |x| is NaN */
 142         vaddss  %xmm0, %xmm0, %xmm0     /* Return x+x (raise invalid) */
 143         ret
 144
 145         .p2align        4
 146 L(near_under_or_overflow_fma):
 147         /* Here if 125*log(2)<=|x|<under/overflow bound */
 148         vmovd   %xmm2, %eax             /* bits of n*K+j with trash */
 149         vsubsd  L(DP_RD)(%rip), %xmm2, %xmm2    /* DP t=round(x*K/log(2)) */
 150         movl    %eax, %edx              /* n*K+j with trash */
 151         andl    $0x3f, %eax             /* bits of j */
 152         vmulsd  L(DP_NLN2K)(%rip),%xmm2, %xmm2/* DP -t*log(2)/K */
 153         andl    $0xffffffc0, %edx       /* bits of n */
 154         vaddsd  %xmm1, %xmm2, %xmm0     /* DP y=x-t*log(2)/K */
 155         vmulsd  %xmm0, %xmm0, %xmm2     /* DP z=y*y */
 156         addl    $0xffc0, %edx           /* bits of n + DP exponent bias */
 157         vfmadd213sd L(DP_P0)(%rip), %xmm2, %xmm3/* DP P2*z+P0 */
 158         shlq    $46, %rdx               /* DP 2^n */
 159         vfmadd213sd L(DP_P1)(%rip), %xmm2, %xmm4/* DP P3*z+P1 */
 160         vmovq   %rdx, %xmm1             /* DP 2^n */
 161         vmulsd  %xmm2, %xmm4, %xmm4     /* DP (P3*z+P1)*z */
 162         vfmadd213sd %xmm4, %xmm3, %xmm0 /* DP (P2*z+P0)*y */
 163         vmovsd  (%rsi,%rax,8), %xmm2
 164         vfmadd213sd %xmm2, %xmm2, %xmm0 /* DP T[j]*(P(y)+1) */
 165         vmulsd  %xmm1, %xmm0, %xmm0     /* DP result=2^n*(T[j]*(P(y)+1)) */
 166         vcvtsd2ss %xmm0, %xmm0, %xmm0   /* convert result to single precision */
 167         ret
 168 END(__ieee754_expf_fma)
 169
 170         .section .rodata.cst8,"aM",@progbits,8
 171         .p2align 3
 172 L(DP_RD): /* double precision 2^52+2^51 */
 173         .long   0x00000000, 0x43380000
 174         .type L(DP_RD), @object
 175         ASM_SIZE_DIRECTIVE(L(DP_RD))
 176
 177 #define __ieee754_expf __ieee754_expf_sse2
 178
 179 #undef strong_alias
 180 #define strong_alias(ignored1, ignored2)
 181
 182 #include <sysdeps/x86_64/fpu/e_expf.S>