usr/src/lib/libmvec/common/__vrsqrt.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29
  30 #include <sys/isa_defs.h>
  31 #include "libm_inlines.h"
  32
  33 #ifdef _LITTLE_ENDIAN
  34 #define HI(x)   *(1+(int*)x)
  35 #define LO(x)   *(unsigned*)x
  36 #else
  37 #define HI(x)   *(int*)x
  38 #define LO(x)   *(1+(unsigned*)x)
  39 #endif
  40
  41 #ifdef __RESTRICT
  42 #define restrict _Restrict
  43 #else
  44 #define restrict
  45 #endif
  46
  47 /* double rsqrt(double x)
  48  *
  49  * Method :
  50  *      1. Special cases:
  51  *              for x = NaN                             => QNaN;
  52  *              for x = +Inf                            => 0;
  53  *              for x is negative, -Inf                 => QNaN + invalid;
  54  *              for x = +0                              => +Inf + divide-by-zero;
  55  *              for x = -0                              => -Inf + divide-by-zero.
  56  *      2. Computes reciprocal square root from:
  57  *              x = m * 2**n
  58  *      Where:
  59  *              m = [0.5, 2),
  60  *              n = ((exponent + 1) & ~1).
  61  *      Then:
  62  *              rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
  63  *      3. Computes 1/sqrt(m) from:
  64  *              1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
  65  *      Where:
  66  *              m = m0 + dm,
  67  *              m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
  68  *              m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
  69  *              m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
  70  *      Then:
  71  *              1/sqrt(m0) is looked up in a table,
  72  *              1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
  73  *              1/sqrt(1 + (1/m0)*dm) is computed using approximation:
  74  *                      1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
  75  *                                              * z + a2) * z + a1) * z + a0
  76  *                      where z = [-1/128, 1/128].
  77  *
  78  * Accuracy:
  79  *      The maximum relative error for the approximating
  80  *      polynomial is 2**(-56.26).
  81  *      Maximum error observed: less than 0.563 ulp after 1.500.000.000
  82  *      results.
  83  */
  84
  85 extern double sqrt (double);
  86 extern const double __vlibm_TBL_rsqrt[];
  87
  88 static void
  89 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
  90
  91 #pragma no_inline(__vrsqrt_n)
  92
  93 #define RETURN(ret)                                             \
  94 {                                                               \
  95         *py = (ret);                                            \
  96         py += stridey;                                          \
  97         if (n_n == 0)                                           \
  98         {                                                       \
  99                 spx = px; spy = py;                             \
 100                 hx = HI(px);                                    \
 101                 continue;                                       \
 102         }                                                       \
 103         n--;                                                    \
 104         break;                                                  \
 105 }
 106
 107 static const double
 108         DONE = 1.0,
 109         K1 = -5.00000000000005209867e-01,
 110         K2 =  3.75000000000004884257e-01,
 111         K3 = -3.12499999317136886551e-01,
 112         K4 =  2.73437499359815081532e-01,
 113         K5 = -2.46116125605037803130e-01,
 114         K6 =  2.25606914648617522896e-01;
 115
 116 void
 117 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
 118 {
 119         double          *spx, *spy;
 120         int             ax, lx, hx, n_n;
 121         double          res;
 122
 123         while (n > 1)
 124         {
 125                 n_n = 0;
 126                 spx = px;
 127                 spy = py;
 128                 hx = HI(px);
 129                 for (; n > 1 ; n--)
 130                 {
 131                         px += stridex;
 132                         if (hx >= 0x7ff00000)           /* X = NaN or Inf       */
 133                         {
 134                                 res = *(px - stridex);
 135                                 RETURN (DONE / res)
 136                         }
 137
 138                         py += stridey;
 139
 140                         if (hx < 0x00100000)            /* X = denormal, zero or negative       */
 141                         {
 142                                 py -= stridey;
 143                                 ax = hx & 0x7fffffff;
 144                                 lx = LO((px - stridex));
 145                                 res = *(px - stridex);
 146
 147                                 if ((ax | lx) == 0)     /* |X| = zero   */
 148                                 {
 149                                         RETURN (DONE / res)
 150                                 }
 151                                 else if (hx >= 0)       /* X = denormal */
 152                                 {
 153                                         double          res_c0, dsqrt_exp0;
 154                                         int             ind0, sqrt_exp0;
 155                                         double          xx0, dexp_hi0, dexp_lo0;
 156                                         int             hx0, resh0, res_ch0;
 157
 158                                         res = *(long long*)&res;
 159
 160                                         hx0 = HI(&res);
 161                                         sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
 162                                         ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 163
 164                                         resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 165                                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 166                                         HI(&res) = resh0;
 167                                         HI(&res_c0) = res_ch0;
 168                                         LO(&res_c0) = 0;
 169
 170                                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 171                                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 172                                         xx0 = dexp_hi0 * dexp_hi0;
 173                                         xx0 = (res - res_c0) * xx0;
 174                                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 175
 176                                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 177
 178                                         HI(&dsqrt_exp0) = sqrt_exp0;
 179                                         LO(&dsqrt_exp0) = 0;
 180                                         res *= dsqrt_exp0;
 181
 182                                         RETURN (res)
 183                                 }
 184                                 else    /* X = negative */
 185                                 {
 186                                         RETURN (sqrt(res))
 187                                 }
 188                         }
 189                         n_n++;
 190                         hx = HI(px);
 191                 }
 192                 if (n_n > 0)
 193                         __vrsqrt_n(n_n, spx, stridex, spy, stridey);
 194         }
 195         if (n > 0)
 196         {
 197                 hx = HI(px);
 198
 199                 if (hx >= 0x7ff00000)           /* X = NaN or Inf       */
 200                 {
 201                         res = *px;
 202                         *py = DONE / res;
 203                 }
 204                 else if (hx < 0x00100000)       /* X = denormal, zero or negative       */
 205                 {
 206                         ax = hx & 0x7fffffff;
 207                         lx = LO(px);
 208                         res = *px;
 209
 210                         if ((ax | lx) == 0)     /* |X| = zero   */
 211                         {
 212                                 *py = DONE / res;
 213                         }
 214                         else if (hx >= 0)       /* X = denormal */
 215                         {
 216                                 double          res_c0, dsqrt_exp0;
 217                                 int             ind0, sqrt_exp0;
 218                                 double          xx0, dexp_hi0, dexp_lo0;
 219                                 int             hx0, resh0, res_ch0;
 220
 221                                 res = *(long long*)&res;
 222
 223                                 hx0 = HI(&res);
 224                                 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
 225                                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 226
 227                                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 228                                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 229                                 HI(&res) = resh0;
 230                                 HI(&res_c0) = res_ch0;
 231                                 LO(&res_c0) = 0;
 232
 233                                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 234                                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 235                                 xx0 = dexp_hi0 * dexp_hi0;
 236                                 xx0 = (res - res_c0) * xx0;
 237                                 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 238
 239                                 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 240
 241                                 HI(&dsqrt_exp0) = sqrt_exp0;
 242                                 LO(&dsqrt_exp0) = 0;
 243                                 res *= dsqrt_exp0;
 244
 245                                 *py = res;
 246                         }
 247                         else    /* X = negative */
 248                         {
 249                                 *py = sqrt(res);
 250                         }
 251                 }
 252                 else
 253                 {
 254                         double          res_c0, dsqrt_exp0;
 255                         int             ind0, sqrt_exp0;
 256                         double          xx0, dexp_hi0, dexp_lo0;
 257                         int             resh0, res_ch0;
 258
 259                         sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
 260                         ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
 261
 262                         resh0 = (hx & 0x001fffff) | 0x3fe00000;
 263                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 264                         HI(&res) = resh0;
 265                         LO(&res) = LO(px);
 266                         HI(&res_c0) = res_ch0;
 267                         LO(&res_c0) = 0;
 268
 269                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 270                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 271                         xx0 = dexp_hi0 * dexp_hi0;
 272                         xx0 = (res - res_c0) * xx0;
 273                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 274
 275                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 276
 277                         HI(&dsqrt_exp0) = sqrt_exp0;
 278                         LO(&dsqrt_exp0) = 0;
 279                         res *= dsqrt_exp0;
 280
 281                         *py = res;
 282                 }
 283         }
 284 }
 285
 286 static void
 287 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
 288 {
 289         double          res0, res_c0, dsqrt_exp0;
 290         double          res1, res_c1, dsqrt_exp1;
 291         double          res2, res_c2, dsqrt_exp2;
 292         int             ind0, sqrt_exp0;
 293         int             ind1, sqrt_exp1;
 294         int             ind2, sqrt_exp2;
 295         double          xx0, dexp_hi0, dexp_lo0;
 296         double          xx1, dexp_hi1, dexp_lo1;
 297         double          xx2, dexp_hi2, dexp_lo2;
 298         int             hx0, resh0, res_ch0;
 299         int             hx1, resh1, res_ch1;
 300         int             hx2, resh2, res_ch2;
 301
 302         LO(&dsqrt_exp0) = 0;
 303         LO(&dsqrt_exp1) = 0;
 304         LO(&dsqrt_exp2) = 0;
 305         LO(&res_c0) = 0;
 306         LO(&res_c1) = 0;
 307         LO(&res_c2) = 0;
 308
 309         for(; n > 2 ; n -= 3)
 310         {
 311                 hx0 = HI(px);
 312                 LO(&res0) = LO(px);
 313                 px += stridex;
 314
 315                 hx1 = HI(px);
 316                 LO(&res1) = LO(px);
 317                 px += stridex;
 318
 319                 hx2 = HI(px);
 320                 LO(&res2) = LO(px);
 321                 px += stridex;
 322
 323                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
 324                 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
 325                 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
 326                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 327                 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
 328                 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
 329
 330                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 331                 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
 332                 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
 333                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 334                 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
 335                 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
 336                 HI(&res0) = resh0;
 337                 HI(&res1) = resh1;
 338                 HI(&res2) = resh2;
 339                 HI(&res_c0) = res_ch0;
 340                 HI(&res_c1) = res_ch1;
 341                 HI(&res_c2) = res_ch2;
 342
 343                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 344                 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
 345                 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
 346                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 347                 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
 348                 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
 349                 xx0 = dexp_hi0 * dexp_hi0;
 350                 xx1 = dexp_hi1 * dexp_hi1;
 351                 xx2 = dexp_hi2 * dexp_hi2;
 352                 xx0 = (res0 - res_c0) * xx0;
 353                 xx1 = (res1 - res_c1) * xx1;
 354                 xx2 = (res2 - res_c2) * xx2;
 355                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 356                 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
 357                 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
 358
 359                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
 360                 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
 361                 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
 362
 363                 HI(&dsqrt_exp0) = sqrt_exp0;
 364                 HI(&dsqrt_exp1) = sqrt_exp1;
 365                 HI(&dsqrt_exp2) = sqrt_exp2;
 366                 res0 *= dsqrt_exp0;
 367                 res1 *= dsqrt_exp1;
 368                 res2 *= dsqrt_exp2;
 369
 370                 *py = res0;
 371                 py += stridey;
 372
 373                 *py = res1;
 374                 py += stridey;
 375
 376                 *py = res2;
 377                 py += stridey;
 378         }
 379
 380         for(; n > 0 ; n--)
 381         {
 382                 hx0 = HI(px);
 383
 384                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
 385                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 386
 387                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 388                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 389                 HI(&res0) = resh0;
 390                 LO(&res0) = LO(px);
 391                 HI(&res_c0) = res_ch0;
 392                 LO(&res_c0) = 0;
 393
 394                 px += stridex;
 395
 396                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 397                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 398                 xx0 = dexp_hi0 * dexp_hi0;
 399                 xx0 = (res0 - res_c0) * xx0;
 400                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 401
 402                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
 403
 404                 HI(&dsqrt_exp0) = sqrt_exp0;
 405                 LO(&dsqrt_exp0) = 0;
 406                 res0 *= dsqrt_exp0;
 407
 408                 *py = res0;
 409                 py += stridey;
 410         }
 411 }