sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c

   1 /* Compute sine and cosine of argument optimized with vector.
   2    Copyright (C) 2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <errno.h>
  20 #include <math.h>
  21 #include <math_private.h>
  22 #include <x86intrin.h>
  23 #include <libm-alias-float.h>
  24
  25 #define SINCOSF __sincosf_fma
  26
  27 #ifndef SINCOSF
  28 # define SINCOSF_FUNC __sincosf
  29 #else
  30 # define SINCOSF_FUNC SINCOSF
  31 #endif
  32
  33 /* PI/2 with 98 bits of accuracy.  */
  34 static const double PI_2_hi = 0x1.921fb544p+0;
  35 static const double PI_2_lo = 0x1.0b4611a626332p-34;
  36
  37 static const double SMALL = 0x1p-50; /* 2^-50.  */
  38 static const double inv_PI_4 = 0x1.45f306dc9c883p+0; /* 4/PI.  */
  39
  40 #define FLOAT_EXPONENT_SHIFT 23
  41 #define FLOAT_EXPONENT_BIAS 127
  42
  43 static const double pio2_table[] = {
  44   0 * M_PI_2,
  45   1 * M_PI_2,
  46   2 * M_PI_2,
  47   3 * M_PI_2,
  48   4 * M_PI_2,
  49   5 * M_PI_2
  50 };
  51
  52 static const double invpio4_table[] = {
  53   0x0p+0,
  54   0x1.45f306cp+0,
  55   0x1.c9c882ap-28,
  56   0x1.4fe13a8p-58,
  57   0x1.f47d4dp-85,
  58   0x1.bb81b6cp-112,
  59   0x1.4acc9ep-142,
  60   0x1.0e4107cp-169
  61 };
  62
  63 static const double ones[] = { 1.0, -1.0 };
  64
  65 /* Chebyshev constants for sin and cos, range -PI/4 - PI/4.  */
  66 static const __v2df V0 = { -0x1.5555555551cd9p-3, -0x1.ffffffffe98aep-2};
  67 static const __v2df V1 = { 0x1.1111110c2688bp-7, 0x1.55555545c50c7p-5 };
  68 static const __v2df V2 = { -0x1.a019f8b4bd1f9p-13, -0x1.6c16b348b6874p-10 };
  69 static const __v2df V3 = { 0x1.71d7264e6b5b4p-19, 0x1.a00eb9ac43ccp-16 };
  70 static const __v2df V4 = { -0x1.a947e1674b58ap-26, -0x1.23c97dd8844d7p-22 };
  71
  72 /* Chebyshev constants for sin and cos, range 2^-27 - 2^-5.  */
  73 static const __v2df VC0 = { -0x1.555555543d49dp-3, -0x1.fffffff5cc6fdp-2 };
  74 static const __v2df VC1 = { 0x1.110f475cec8c5p-7, 0x1.55514b178dac5p-5 };
  75
  76 static const __v2df v2ones = { 1.0, 1.0 };
  77
  78 /* Compute the sine and cosine values using Chebyshev polynomials where
  79    THETA is the range reduced absolute value of the input
  80    and it is less than Pi/4,
  81    N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide
  82    whether a sine or cosine approximation is more accurate and
  83    SIGNBIT is used to add the correct sign after the Chebyshev
  84    polynomial is computed.  */
  85 static void
  86 reduced_sincos (const double theta, const unsigned int n,
  87                 const unsigned int signbit, float *sinx, float *cosx)
  88 {
  89   __v2df v2x, v2sx, v2cx;
  90   const __v2df v2theta = { theta, theta };
  91   const __v2df v2theta2 = v2theta * v2theta;
  92   /* Here sinf() and cosf() are calculated using sin Chebyshev polynomial:
  93      x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
  94   v2x = V3 + v2theta2 * V4;    /* S3+x^2*S4.  */
  95   v2x = V2 + v2theta2 * v2x;   /* S2+x^2*(S3+x^2*S4).  */
  96   v2x = V1 + v2theta2 * v2x;   /* S1+x^2*(S2+x^2*(S3+x^2*S4)).  */
  97   v2x = V0 + v2theta2 * v2x;   /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))).  */
  98   v2x = v2theta2 * v2x;
  99   v2cx = v2ones + v2x;
 100   v2sx = v2theta + v2theta * v2x;
 101   /* We are operating on |x|, so we need to add back the original
 102      signbit for sinf.  */
 103   /* Determine positive or negative primary interval.  */
 104   /* Are we in the primary interval of sin or cos?  */
 105   if ((n & 2) == 0)
 106     {
 107       const __v2df v2sign =
 108         {
 109           ones[((n >> 2) & 1) ^ signbit],
 110           ones[((n + 2) >> 2) & 1]
 111         };
 112       v2cx[0] = v2sx[0];
 113       v2cx *= v2sign;
 114       __v4sf v4sx = _mm_cvtpd_ps (v2cx);
 115       *sinx = v4sx[0];
 116       *cosx = v4sx[1];
 117     }
 118   else
 119     {
 120       const __v2df v2sign =
 121         {
 122           ones[((n + 2) >> 2) & 1],
 123           ones[((n >> 2) & 1) ^ signbit]
 124         };
 125       v2cx[0] = v2sx[0];
 126       v2cx *= v2sign;
 127       __v4sf v4sx = _mm_cvtpd_ps (v2cx);
 128       *sinx = v4sx[1];
 129       *cosx = v4sx[0];
 130     }
 131 }
 132
 133 void
 134 SINCOSF_FUNC (float x, float *sinx, float *cosx)
 135 {
 136   double theta = x;
 137   double abstheta = fabs (theta);
 138   uint32_t ix, xi;
 139   GET_FLOAT_WORD (xi, x);
 140   /* |x| */
 141   ix = xi & 0x7fffffff;
 142   /* If |x|< Pi/4.  */
 143   if (ix < 0x3f490fdb)
 144     {
 145       if (ix >= 0x3d000000) /* |x| >= 2^-5.  */
 146         {
 147           __v2df v2x, v2sx, v2cx;
 148           const __v2df v2theta = { theta, theta };
 149           const __v2df v2theta2 = v2theta * v2theta;
 150           /* Chebyshev polynomial of the form for sin and cos.  */
 151           v2x = V3 + v2theta2 * V4;
 152           v2x = V2 + v2theta2 * v2x;
 153           v2x = V1 + v2theta2 * v2x;
 154           v2x = V0 + v2theta2 * v2x;
 155           v2x = v2theta2 * v2x;
 156           v2cx = v2ones + v2x;
 157           v2sx = v2theta + v2theta * v2x;
 158           v2cx[0] = v2sx[0];
 159           __v4sf v4sx = _mm_cvtpd_ps (v2cx);
 160           *sinx = v4sx[0];
 161           *cosx = v4sx[1];
 162         }
 163       else if (ix >= 0x32000000)     /* |x| >= 2^-27.  */
 164         {
 165           /* A simpler Chebyshev approximation is close enough for this range:
 166              for sin: x+x^3*(SS0+x^2*SS1)
 167              for cos: 1.0+x^2*(CC0+x^3*CC1).  */
 168           __v2df v2x, v2sx, v2cx;
 169           const __v2df v2theta = { theta, theta };
 170           const __v2df v2theta2 = v2theta * v2theta;
 171           v2x = VC0 + v2theta * v2theta2 * VC1;
 172           v2x = v2theta2 * v2x;
 173           v2cx = v2ones + v2x;
 174           v2sx = v2theta + v2theta * v2x;
 175           v2cx[0] = v2sx[0];
 176           __v4sf v4sx = _mm_cvtpd_ps (v2cx);
 177           *sinx = v4sx[0];
 178           *cosx = v4sx[1];
 179         }
 180       else
 181         {
 182           /* Handle some special cases.  */
 183           if (ix)
 184             *sinx = theta - (theta * SMALL);
 185           else
 186             *sinx = theta;
 187           *cosx = 1.0 - abstheta;
 188         }
 189     }
 190   else                          /* |x| >= Pi/4.  */
 191     {
 192       unsigned int signbit = xi >> 31;
 193       if (ix < 0x40e231d6) /* |x| < 9*Pi/4.  */
 194         {
 195           /* There are cases where FE_UPWARD rounding mode can
 196              produce a result of abstheta * inv_PI_4 == 9,
 197              where abstheta < 9pi/4, so the domain for
 198              pio2_table must go to 5 (9 / 2 + 1).  */
 199           unsigned int n = (abstheta * inv_PI_4) + 1;
 200           theta = abstheta - pio2_table[n / 2];
 201           reduced_sincos (theta, n, signbit, sinx, cosx);
 202         }
 203       else if (ix < 0x7f800000)
 204         {
 205           if (ix < 0x4b000000)     /* |x| < 2^23.  */
 206             {
 207               unsigned int n = ((unsigned int) (abstheta * inv_PI_4)) + 1;
 208               double x = n / 2;
 209               theta = (abstheta - x * PI_2_hi) - x * PI_2_lo;
 210               /* Argument reduction needed.  */
 211               reduced_sincos (theta, n, signbit, sinx, cosx);
 212             }
 213           else                  /* |x| >= 2^23.  */
 214             {
 215               x = fabsf (x);
 216               int exponent
 217                 = (ix >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS;
 218               exponent += 3;
 219               exponent /= 28;
 220               double a = invpio4_table[exponent] * x;
 221               double b = invpio4_table[exponent + 1] * x;
 222               double c = invpio4_table[exponent + 2] * x;
 223               double d = invpio4_table[exponent + 3] * x;
 224               uint64_t l = a;
 225               l &= ~0x7;
 226               a -= l;
 227               double e = a + b;
 228               l = e;
 229               e = a - l;
 230               if (l & 1)
 231                 {
 232                   e -= 1.0;
 233                   e += b;
 234                   e += c;
 235                   e += d;
 236                   e *= M_PI_4;
 237                   reduced_sincos (e, l + 1, signbit, sinx, cosx);
 238                 }
 239               else
 240                 {
 241                   e += b;
 242                   e += c;
 243                   e += d;
 244                   if (e <= 1.0)
 245                     {
 246                       e *= M_PI_4;
 247                       reduced_sincos (e, l + 1, signbit, sinx, cosx);
 248                     }
 249                   else
 250                     {
 251                       l++;
 252                       e -= 2.0;
 253                       e *= M_PI_4;
 254                       reduced_sincos (e, l + 1, signbit, sinx, cosx);
 255                     }
 256                 }
 257             }
 258         }
 259       else
 260         {
 261           if (ix == 0x7f800000)
 262             __set_errno (EDOM);
 263           /* sin/cos(Inf or NaN) is NaN.  */
 264           *sinx = *cosx = x - x;
 265         }
 266     }
 267 }
 268
 269 #ifndef SINCOSF
 270 libm_alias_float (__sincos, sincos)
 271 #endif