sysdeps/ieee754/dbl-64/mpa.c

   1 /*
   2  * IBM Accurate Mathematical Library
   3  * written by International Business Machines Corp.
   4  * Copyright (C) 2001-2015 Free Software Foundation, Inc.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU Lesser General Public License as published by
   8  * the Free Software Foundation; either version 2.1 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public License
  17  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 /************************************************************************/
  20 /*  MODULE_NAME: mpa.c                                                  */
  21 /*                                                                      */
  22 /*  FUNCTIONS:                                                          */
  23 /*               mcr                                                    */
  24 /*               acr                                                    */
  25 /*               cpy                                                    */
  26 /*               norm                                                   */
  27 /*               denorm                                                 */
  28 /*               mp_dbl                                                 */
  29 /*               dbl_mp                                                 */
  30 /*               add_magnitudes                                         */
  31 /*               sub_magnitudes                                         */
  32 /*               add                                                    */
  33 /*               sub                                                    */
  34 /*               mul                                                    */
  35 /*               inv                                                    */
  36 /*               dvd                                                    */
  37 /*                                                                      */
  38 /* Arithmetic functions for multiple precision numbers.                 */
  39 /* Relative errors are bounded                                          */
  40 /************************************************************************/
  41
  42
  43 #include "endian.h"
  44 #include "mpa.h"
  45 #include <sys/param.h>
  46 #include <alloca.h>
  47
  48 #ifndef SECTION
  49 # define SECTION
  50 #endif
  51
  52 #ifndef NO__CONST
  53 const mp_no __mpone = { 1, { 1.0, 1.0 } };
  54 const mp_no __mptwo = { 1, { 1.0, 2.0 } };
  55 #endif
  56
  57 #ifndef NO___ACR
  58 /* Compare mantissa of two multiple precision numbers regardless of the sign
  59    and exponent of the numbers.  */
  60 static int
  61 mcr (const mp_no *x, const mp_no *y, int p)
  62 {
  63   long i;
  64   long p2 = p;
  65   for (i = 1; i <= p2; i++)
  66     {
  67       if (X[i] == Y[i])
  68         continue;
  69       else if (X[i] > Y[i])
  70         return 1;
  71       else
  72         return -1;
  73     }
  74   return 0;
  75 }
  76
  77 /* Compare the absolute values of two multiple precision numbers.  */
  78 int
  79 __acr (const mp_no *x, const mp_no *y, int p)
  80 {
  81   long i;
  82
  83   if (X[0] == 0)
  84     {
  85       if (Y[0] == 0)
  86         i = 0;
  87       else
  88         i = -1;
  89     }
  90   else if (Y[0] == 0)
  91     i = 1;
  92   else
  93     {
  94       if (EX > EY)
  95         i = 1;
  96       else if (EX < EY)
  97         i = -1;
  98       else
  99         i = mcr (x, y, p);
 100     }
 101
 102   return i;
 103 }
 104 #endif
 105
 106 #ifndef NO___CPY
 107 /* Copy multiple precision number X into Y.  They could be the same
 108    number.  */
 109 void
 110 __cpy (const mp_no *x, mp_no *y, int p)
 111 {
 112   long i;
 113
 114   EY = EX;
 115   for (i = 0; i <= p; i++)
 116     Y[i] = X[i];
 117 }
 118 #endif
 119
 120 #ifndef NO___MP_DBL
 121 /* Convert a multiple precision number *X into a double precision
 122    number *Y, normalized case  (|x| >= 2**(-1022))).  */
 123 static void
 124 norm (const mp_no *x, double *y, int p)
 125 {
 126 # define R RADIXI
 127   long i;
 128   double c;
 129   mantissa_t a, u, v, z[5];
 130   if (p < 5)
 131     {
 132       if (p == 1)
 133         c = X[1];
 134       else if (p == 2)
 135         c = X[1] + R * X[2];
 136       else if (p == 3)
 137         c = X[1] + R * (X[2] + R * X[3]);
 138       else if (p == 4)
 139         c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
 140     }
 141   else
 142     {
 143       for (a = 1, z[1] = X[1]; z[1] < TWO23; )
 144         {
 145           a *= 2;
 146           z[1] *= 2;
 147         }
 148
 149       for (i = 2; i < 5; i++)
 150         {
 151           mantissa_store_t d, r;
 152           d = X[i] * (mantissa_store_t) a;
 153           DIV_RADIX (d, r);
 154           z[i] = r;
 155           z[i - 1] += d;
 156         }
 157
 158       u = ALIGN_DOWN_TO (z[3], TWO19);
 159       v = z[3] - u;
 160
 161       if (v == TWO18)
 162         {
 163           if (z[4] == 0)
 164             {
 165               for (i = 5; i <= p; i++)
 166                 {
 167                   if (X[i] == 0)
 168                     continue;
 169                   else
 170                     {
 171                       z[3] += 1;
 172                       break;
 173                     }
 174                 }
 175             }
 176           else
 177             z[3] += 1;
 178         }
 179
 180       c = (z[1] + R * (z[2] + R * z[3])) / a;
 181     }
 182
 183   c *= X[0];
 184
 185   for (i = 1; i < EX; i++)
 186     c *= RADIX;
 187   for (i = 1; i > EX; i--)
 188     c *= RADIXI;
 189
 190   *y = c;
 191 # undef R
 192 }
 193
 194 /* Convert a multiple precision number *X into a double precision
 195    number *Y, Denormal case  (|x| < 2**(-1022))).  */
 196 static void
 197 denorm (const mp_no *x, double *y, int p)
 198 {
 199   long i, k;
 200   long p2 = p;
 201   double c;
 202   mantissa_t u, z[5];
 203
 204 # define R RADIXI
 205   if (EX < -44 || (EX == -44 && X[1] < TWO5))
 206     {
 207       *y = 0;
 208       return;
 209     }
 210
 211   if (p2 == 1)
 212     {
 213       if (EX == -42)
 214         {
 215           z[1] = X[1] + TWO10;
 216           z[2] = 0;
 217           z[3] = 0;
 218           k = 3;
 219         }
 220       else if (EX == -43)
 221         {
 222           z[1] = TWO10;
 223           z[2] = X[1];
 224           z[3] = 0;
 225           k = 2;
 226         }
 227       else
 228         {
 229           z[1] = TWO10;
 230           z[2] = 0;
 231           z[3] = X[1];
 232           k = 1;
 233         }
 234     }
 235   else if (p2 == 2)
 236     {
 237       if (EX == -42)
 238         {
 239           z[1] = X[1] + TWO10;
 240           z[2] = X[2];
 241           z[3] = 0;
 242           k = 3;
 243         }
 244       else if (EX == -43)
 245         {
 246           z[1] = TWO10;
 247           z[2] = X[1];
 248           z[3] = X[2];
 249           k = 2;
 250         }
 251       else
 252         {
 253           z[1] = TWO10;
 254           z[2] = 0;
 255           z[3] = X[1];
 256           k = 1;
 257         }
 258     }
 259   else
 260     {
 261       if (EX == -42)
 262         {
 263           z[1] = X[1] + TWO10;
 264           z[2] = X[2];
 265           k = 3;
 266         }
 267       else if (EX == -43)
 268         {
 269           z[1] = TWO10;
 270           z[2] = X[1];
 271           k = 2;
 272         }
 273       else
 274         {
 275           z[1] = TWO10;
 276           z[2] = 0;
 277           k = 1;
 278         }
 279       z[3] = X[k];
 280     }
 281
 282   u = ALIGN_DOWN_TO (z[3], TWO5);
 283
 284   if (u == z[3])
 285     {
 286       for (i = k + 1; i <= p2; i++)
 287         {
 288           if (X[i] == 0)
 289             continue;
 290           else
 291             {
 292               z[3] += 1;
 293               break;
 294             }
 295         }
 296     }
 297
 298   c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
 299
 300   *y = c * TWOM1032;
 301 # undef R
 302 }
 303
 304 /* Convert multiple precision number *X into double precision number *Y.  The
 305    result is correctly rounded to the nearest/even.  */
 306 void
 307 __mp_dbl (const mp_no *x, double *y, int p)
 308 {
 309   if (X[0] == 0)
 310     {
 311       *y = 0;
 312       return;
 313     }
 314
 315   if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
 316     norm (x, y, p);
 317   else
 318     denorm (x, y, p);
 319 }
 320 #endif
 321
 322 /* Get the multiple precision equivalent of X into *Y.  If the precision is too
 323    small, the result is truncated.  */
 324 void
 325 SECTION
 326 __dbl_mp (double x, mp_no *y, int p)
 327 {
 328   long i, n;
 329   long p2 = p;
 330
 331   /* Sign.  */
 332   if (x == 0)
 333     {
 334       Y[0] = 0;
 335       return;
 336     }
 337   else if (x > 0)
 338     Y[0] = 1;
 339   else
 340     {
 341       Y[0] = -1;
 342       x = -x;
 343     }
 344
 345   /* Exponent.  */
 346   for (EY = 1; x >= RADIX; EY += 1)
 347     x *= RADIXI;
 348   for (; x < 1; EY -= 1)
 349     x *= RADIX;
 350
 351   /* Digits.  */
 352   n = MIN (p2, 4);
 353   for (i = 1; i <= n; i++)
 354     {
 355       INTEGER_OF (x, Y[i]);
 356       x *= RADIX;
 357     }
 358   for (; i <= p2; i++)
 359     Y[i] = 0;
 360 }
 361
 362 /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0.  The
 363    sign of the sum *Z is not changed.  X and Y may overlap but not X and Z or
 364    Y and Z.  No guard digit is used.  The result equals the exact sum,
 365    truncated.  */
 366 static void
 367 SECTION
 368 add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 369 {
 370   long i, j, k;
 371   long p2 = p;
 372   mantissa_t zk;
 373
 374   EZ = EX;
 375
 376   i = p2;
 377   j = p2 + EY - EX;
 378   k = p2 + 1;
 379
 380   if (__glibc_unlikely (j < 1))
 381     {
 382       __cpy (x, z, p);
 383       return;
 384     }
 385
 386   zk = 0;
 387
 388   for (; j > 0; i--, j--)
 389     {
 390       zk += X[i] + Y[j];
 391       if (zk >= RADIX)
 392         {
 393           Z[k--] = zk - RADIX;
 394           zk = 1;
 395         }
 396       else
 397         {
 398           Z[k--] = zk;
 399           zk = 0;
 400         }
 401     }
 402
 403   for (; i > 0; i--)
 404     {
 405       zk += X[i];
 406       if (zk >= RADIX)
 407         {
 408           Z[k--] = zk - RADIX;
 409           zk = 1;
 410         }
 411       else
 412         {
 413           Z[k--] = zk;
 414           zk = 0;
 415         }
 416     }
 417
 418   if (zk == 0)
 419     {
 420       for (i = 1; i <= p2; i++)
 421         Z[i] = Z[i + 1];
 422     }
 423   else
 424     {
 425       Z[1] = zk;
 426       EZ += 1;
 427     }
 428 }
 429
 430 /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
 431    The sign of the difference *Z is not changed.  X and Y may overlap but not X
 432    and Z or Y and Z.  One guard digit is used.  The error is less than one
 433    ULP.  */
 434 static void
 435 SECTION
 436 sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 437 {
 438   long i, j, k;
 439   long p2 = p;
 440   mantissa_t zk;
 441
 442   EZ = EX;
 443   i = p2;
 444   j = p2 + EY - EX;
 445   k = p2;
 446
 447   /* Y is too small compared to X, copy X over to the result.  */
 448   if (__glibc_unlikely (j < 1))
 449     {
 450       __cpy (x, z, p);
 451       return;
 452     }
 453
 454   /* The relevant least significant digit in Y is non-zero, so we factor it in
 455      to enhance accuracy.  */
 456   if (j < p2 && Y[j + 1] > 0)
 457     {
 458       Z[k + 1] = RADIX - Y[j + 1];
 459       zk = -1;
 460     }
 461   else
 462     zk = Z[k + 1] = 0;
 463
 464   /* Subtract and borrow.  */
 465   for (; j > 0; i--, j--)
 466     {
 467       zk += (X[i] - Y[j]);
 468       if (zk < 0)
 469         {
 470           Z[k--] = zk + RADIX;
 471           zk = -1;
 472         }
 473       else
 474         {
 475           Z[k--] = zk;
 476           zk = 0;
 477         }
 478     }
 479
 480   /* We're done with digits from Y, so it's just digits in X.  */
 481   for (; i > 0; i--)
 482     {
 483       zk += X[i];
 484       if (zk < 0)
 485         {
 486           Z[k--] = zk + RADIX;
 487           zk = -1;
 488         }
 489       else
 490         {
 491           Z[k--] = zk;
 492           zk = 0;
 493         }
 494     }
 495
 496   /* Normalize.  */
 497   for (i = 1; Z[i] == 0; i++)
 498     ;
 499   EZ = EZ - i + 1;
 500   for (k = 1; i <= p2 + 1; )
 501     Z[k++] = Z[i++];
 502   for (; k <= p2; )
 503     Z[k++] = 0;
 504 }
 505
 506 /* Add *X and *Y and store the result in *Z.  X and Y may overlap, but not X
 507    and Z or Y and Z.  One guard digit is used.  The error is less than one
 508    ULP.  */
 509 void
 510 SECTION
 511 __add (const mp_no *x, const mp_no *y, mp_no *z, int p)
 512 {
 513   int n;
 514
 515   if (X[0] == 0)
 516     {
 517       __cpy (y, z, p);
 518       return;
 519     }
 520   else if (Y[0] == 0)
 521     {
 522       __cpy (x, z, p);
 523       return;
 524     }
 525
 526   if (X[0] == Y[0])
 527     {
 528       if (__acr (x, y, p) > 0)
 529         {
 530           add_magnitudes (x, y, z, p);
 531           Z[0] = X[0];
 532         }
 533       else
 534         {
 535           add_magnitudes (y, x, z, p);
 536           Z[0] = Y[0];
 537         }
 538     }
 539   else
 540     {
 541       if ((n = __acr (x, y, p)) == 1)
 542         {
 543           sub_magnitudes (x, y, z, p);
 544           Z[0] = X[0];
 545         }
 546       else if (n == -1)
 547         {
 548           sub_magnitudes (y, x, z, p);
 549           Z[0] = Y[0];
 550         }
 551       else
 552         Z[0] = 0;
 553     }
 554 }
 555
 556 /* Subtract *Y from *X and return the result in *Z.  X and Y may overlap but
 557    not X and Z or Y and Z.  One guard digit is used.  The error is less than
 558    one ULP.  */
 559 void
 560 SECTION
 561 __sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
 562 {
 563   int n;
 564
 565   if (X[0] == 0)
 566     {
 567       __cpy (y, z, p);
 568       Z[0] = -Z[0];
 569       return;
 570     }
 571   else if (Y[0] == 0)
 572     {
 573       __cpy (x, z, p);
 574       return;
 575     }
 576
 577   if (X[0] != Y[0])
 578     {
 579       if (__acr (x, y, p) > 0)
 580         {
 581           add_magnitudes (x, y, z, p);
 582           Z[0] = X[0];
 583         }
 584       else
 585         {
 586           add_magnitudes (y, x, z, p);
 587           Z[0] = -Y[0];
 588         }
 589     }
 590   else
 591     {
 592       if ((n = __acr (x, y, p)) == 1)
 593         {
 594           sub_magnitudes (x, y, z, p);
 595           Z[0] = X[0];
 596         }
 597       else if (n == -1)
 598         {
 599           sub_magnitudes (y, x, z, p);
 600           Z[0] = -Y[0];
 601         }
 602       else
 603         Z[0] = 0;
 604     }
 605 }
 606
 607 #ifndef NO__MUL
 608 /* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
 609    and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
 610    digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
 611 void
 612 SECTION
 613 __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
 614 {
 615   long i, j, k, ip, ip2;
 616   long p2 = p;
 617   mantissa_store_t zk;
 618   const mp_no *a;
 619   mantissa_store_t *diag;
 620
 621   /* Is z=0?  */
 622   if (__glibc_unlikely (X[0] * Y[0] == 0))
 623     {
 624       Z[0] = 0;
 625       return;
 626     }
 627
 628   /* We need not iterate through all X's and Y's since it's pointless to
 629      multiply zeroes.  Here, both are zero...  */
 630   for (ip2 = p2; ip2 > 0; ip2--)
 631     if (X[ip2] != 0 || Y[ip2] != 0)
 632       break;
 633
 634   a = X[ip2] != 0 ? y : x;
 635
 636   /* ... and here, at least one of them is still zero.  */
 637   for (ip = ip2; ip > 0; ip--)
 638     if (a->d[ip] != 0)
 639       break;
 640
 641   /* The product looks like this for p = 3 (as an example):
 642
 643
 644                                 a1    a2    a3
 645                  x              b1    b2    b3
 646                  -----------------------------
 647                              a1*b3 a2*b3 a3*b3
 648                        a1*b2 a2*b2 a3*b2
 649                  a1*b1 a2*b1 a3*b1
 650
 651      So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
 652      for P >= 3.  We compute the above digits in two parts; the last P-1
 653      digits and then the first P digits.  The last P-1 digits are a sum of
 654      products of the input digits from P to P-k where K is 0 for the least
 655      significant digit and increases as we go towards the left.  The product
 656      term is of the form X[k]*X[P-k] as can be seen in the above example.
 657
 658      The first P digits are also a sum of products with the same product term,
 659      except that the sum is from 1 to k.  This is also evident from the above
 660      example.
 661
 662      Another thing that becomes evident is that only the most significant
 663      ip+ip2 digits of the result are non-zero, where ip and ip2 are the
 664      'internal precision' of the input numbers, i.e. digits after ip and ip2
 665      are all 0.  */
 666
 667   k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
 668
 669   while (k > ip + ip2 + 1)
 670     Z[k--] = 0;
 671
 672   zk = 0;
 673
 674   /* Precompute sums of diagonal elements so that we can directly use them
 675      later.  See the next comment to know we why need them.  */
 676   diag = alloca (k * sizeof (mantissa_store_t));
 677   mantissa_store_t d = 0;
 678   for (i = 1; i <= ip; i++)
 679     {
 680       d += X[i] * (mantissa_store_t) Y[i];
 681       diag[i] = d;
 682     }
 683   while (i < k)
 684     diag[i++] = d;
 685
 686   while (k > p2)
 687     {
 688       long lim = k / 2;
 689
 690       if (k % 2 == 0)
 691         /* We want to add this only once, but since we subtract it in the sum
 692            of products above, we add twice.  */
 693         zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 694
 695       for (i = k - p2, j = p2; i < j; i++, j--)
 696         zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 697
 698       zk -= diag[k - 1];
 699
 700       DIV_RADIX (zk, Z[k]);
 701       k--;
 702     }
 703
 704   /* The real deal.  Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
 705      goes from 1 -> k - 1 and j goes the same range in reverse.  To reduce the
 706      number of multiplications, we halve the range and if k is an even number,
 707      add the diagonal element X[k/2]Y[k/2].  Through the half range, we compute
 708      X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
 709
 710      This reduction tells us that we're summing two things, the first term
 711      through the half range and the negative of the sum of the product of all
 712      terms of X and Y in the full range.  i.e.
 713
 714      SUM(X[i] * Y[i]) for k terms.  This is precalculated above for each k in
 715      a single loop so that it completes in O(n) time and can hence be directly
 716      used in the loop below.  */
 717   while (k > 1)
 718     {
 719       long lim = k / 2;
 720
 721       if (k % 2 == 0)
 722         /* We want to add this only once, but since we subtract it in the sum
 723            of products above, we add twice.  */
 724         zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 725
 726       for (i = 1, j = k - 1; i < j; i++, j--)
 727         zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 728
 729       zk -= diag[k - 1];
 730
 731       DIV_RADIX (zk, Z[k]);
 732       k--;
 733     }
 734   Z[k] = zk;
 735
 736   /* Get the exponent sum into an intermediate variable.  This is a subtle
 737      optimization, where given enough registers, all operations on the exponent
 738      happen in registers and the result is written out only once into EZ.  */
 739   int e = EX + EY;
 740
 741   /* Is there a carry beyond the most significant digit?  */
 742   if (__glibc_unlikely (Z[1] == 0))
 743     {
 744       for (i = 1; i <= p2; i++)
 745         Z[i] = Z[i + 1];
 746       e--;
 747     }
 748
 749   EZ = e;
 750   Z[0] = X[0] * Y[0];
 751 }
 752 #endif
 753
 754 #ifndef NO__SQR
 755 /* Square *X and store result in *Y.  X and Y may not overlap.  For P in
 756    [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
 757    error is bounded by 1.001 ULP.  This is a faster special case of
 758    multiplication.  */
 759 void
 760 SECTION
 761 __sqr (const mp_no *x, mp_no *y, int p)
 762 {
 763   long i, j, k, ip;
 764   mantissa_store_t yk;
 765
 766   /* Is z=0?  */
 767   if (__glibc_unlikely (X[0] == 0))
 768     {
 769       Y[0] = 0;
 770       return;
 771     }
 772
 773   /* We need not iterate through all X's since it's pointless to
 774      multiply zeroes.  */
 775   for (ip = p; ip > 0; ip--)
 776     if (X[ip] != 0)
 777       break;
 778
 779   k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
 780
 781   while (k > 2 * ip + 1)
 782     Y[k--] = 0;
 783
 784   yk = 0;
 785
 786   while (k > p)
 787     {
 788       mantissa_store_t yk2 = 0;
 789       long lim = k / 2;
 790
 791       if (k % 2 == 0)
 792         yk += X[lim] * (mantissa_store_t) X[lim];
 793
 794       /* In __mul, this loop (and the one within the next while loop) run
 795          between a range to calculate the mantissa as follows:
 796
 797          Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
 798                 + X[n] * Y[k]
 799
 800          For X == Y, we can get away with summing halfway and doubling the
 801          result.  For cases where the range size is even, the mid-point needs
 802          to be added separately (above).  */
 803       for (i = k - p, j = p; i < j; i++, j--)
 804         yk2 += X[i] * (mantissa_store_t) X[j];
 805
 806       yk += 2 * yk2;
 807
 808       DIV_RADIX (yk, Y[k]);
 809       k--;
 810     }
 811
 812   while (k > 1)
 813     {
 814       mantissa_store_t yk2 = 0;
 815       long lim = k / 2;
 816
 817       if (k % 2 == 0)
 818         yk += X[lim] * (mantissa_store_t) X[lim];
 819
 820       /* Likewise for this loop.  */
 821       for (i = 1, j = k - 1; i < j; i++, j--)
 822         yk2 += X[i] * (mantissa_store_t) X[j];
 823
 824       yk += 2 * yk2;
 825
 826       DIV_RADIX (yk, Y[k]);
 827       k--;
 828     }
 829   Y[k] = yk;
 830
 831   /* Squares are always positive.  */
 832   Y[0] = 1;
 833
 834   /* Get the exponent sum into an intermediate variable.  This is a subtle
 835      optimization, where given enough registers, all operations on the exponent
 836      happen in registers and the result is written out only once into EZ.  */
 837   int e = EX * 2;
 838
 839   /* Is there a carry beyond the most significant digit?  */
 840   if (__glibc_unlikely (Y[1] == 0))
 841     {
 842       for (i = 1; i <= p; i++)
 843         Y[i] = Y[i + 1];
 844       e--;
 845     }
 846
 847   EY = e;
 848 }
 849 #endif
 850
 851 /* Invert *X and store in *Y.  Relative error bound:
 852    - For P = 2: 1.001 * R ^ (1 - P)
 853    - For P = 3: 1.063 * R ^ (1 - P)
 854    - For P > 3: 2.001 * R ^ (1 - P)
 855
 856    *X = 0 is not permissible.  */
 857 static void
 858 SECTION
 859 __inv (const mp_no *x, mp_no *y, int p)
 860 {
 861   long i;
 862   double t;
 863   mp_no z, w;
 864   static const int np1[] =
 865     { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 866     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
 867   };
 868
 869   __cpy (x, &z, p);
 870   z.e = 0;
 871   __mp_dbl (&z, &t, p);
 872   t = 1 / t;
 873   __dbl_mp (t, y, p);
 874   EY -= EX;
 875
 876   for (i = 0; i < np1[p]; i++)
 877     {
 878       __cpy (y, &w, p);
 879       __mul (x, &w, y, p);
 880       __sub (&__mptwo, y, &z, p);
 881       __mul (&w, &z, y, p);
 882     }
 883 }
 884
 885 /* Divide *X by *Y and store result in *Z.  X and Y may overlap but not X and Z
 886    or Y and Z.  Relative error bound:
 887    - For P = 2: 2.001 * R ^ (1 - P)
 888    - For P = 3: 2.063 * R ^ (1 - P)
 889    - For P > 3: 3.001 * R ^ (1 - P)
 890
 891    *X = 0 is not permissible.  */
 892 void
 893 SECTION
 894 __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
 895 {
 896   mp_no w;
 897
 898   if (X[0] == 0)
 899     Z[0] = 0;
 900   else
 901     {
 902       __inv (y, &w, p);
 903       __mul (x, &w, z, p);
 904     }
 905 }