sysdeps/ieee754/dbl-64/mpa.c

   1 /*
   2  * IBM Accurate Mathematical Library
   3  * written by International Business Machines Corp.
   4  * Copyright (C) 2001-2017 Free Software Foundation, Inc.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU Lesser General Public License as published by
   8  * the Free Software Foundation; either version 2.1 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public License
  17  * along with this program; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19 /************************************************************************/
  20 /*  MODULE_NAME: mpa.c                                                  */
  21 /*                                                                      */
  22 /*  FUNCTIONS:                                                          */
  23 /*               mcr                                                    */
  24 /*               acr                                                    */
  25 /*               cpy                                                    */
  26 /*               norm                                                   */
  27 /*               denorm                                                 */
  28 /*               mp_dbl                                                 */
  29 /*               dbl_mp                                                 */
  30 /*               add_magnitudes                                         */
  31 /*               sub_magnitudes                                         */
  32 /*               add                                                    */
  33 /*               sub                                                    */
  34 /*               mul                                                    */
  35 /*               inv                                                    */
  36 /*               dvd                                                    */
  37 /*                                                                      */
  38 /* Arithmetic functions for multiple precision numbers.                 */
  39 /* Relative errors are bounded                                          */
  40 /************************************************************************/
  41
  42
  43 #include "endian.h"
  44 #include "mpa.h"
  45 #include <sys/param.h>
  46 #include <alloca.h>
  47
  48 #ifndef SECTION
  49 # define SECTION
  50 #endif
  51
  52 #ifndef NO__CONST
  53 const mp_no __mpone = { 1, { 1.0, 1.0 } };
  54 const mp_no __mptwo = { 1, { 1.0, 2.0 } };
  55 #endif
  56
  57 #ifndef NO___ACR
  58 /* Compare mantissa of two multiple precision numbers regardless of the sign
  59    and exponent of the numbers.  */
  60 static int
  61 mcr (const mp_no *x, const mp_no *y, int p)
  62 {
  63   long i;
  64   long p2 = p;
  65   for (i = 1; i <= p2; i++)
  66     {
  67       if (X[i] == Y[i])
  68         continue;
  69       else if (X[i] > Y[i])
  70         return 1;
  71       else
  72         return -1;
  73     }
  74   return 0;
  75 }
  76
  77 /* Compare the absolute values of two multiple precision numbers.  */
  78 int
  79 __acr (const mp_no *x, const mp_no *y, int p)
  80 {
  81   long i;
  82
  83   if (X[0] == 0)
  84     {
  85       if (Y[0] == 0)
  86         i = 0;
  87       else
  88         i = -1;
  89     }
  90   else if (Y[0] == 0)
  91     i = 1;
  92   else
  93     {
  94       if (EX > EY)
  95         i = 1;
  96       else if (EX < EY)
  97         i = -1;
  98       else
  99         i = mcr (x, y, p);
 100     }
 101
 102   return i;
 103 }
 104 #endif
 105
 106 #ifndef NO___CPY
 107 /* Copy multiple precision number X into Y.  They could be the same
 108    number.  */
 109 void
 110 __cpy (const mp_no *x, mp_no *y, int p)
 111 {
 112   long i;
 113
 114   EY = EX;
 115   for (i = 0; i <= p; i++)
 116     Y[i] = X[i];
 117 }
 118 #endif
 119
 120 #ifndef NO___MP_DBL
 121 /* Convert a multiple precision number *X into a double precision
 122    number *Y, normalized case (|x| >= 2**(-1022))).  X has precision
 123    P, which is positive.  */
 124 static void
 125 norm (const mp_no *x, double *y, int p)
 126 {
 127 # define R RADIXI
 128   long i;
 129   double c;
 130   mantissa_t a, u, v, z[5];
 131   if (p < 5)
 132     {
 133       if (p == 1)
 134         c = X[1];
 135       else if (p == 2)
 136         c = X[1] + R * X[2];
 137       else if (p == 3)
 138         c = X[1] + R * (X[2] + R * X[3]);
 139       else /* p == 4.  */
 140         c = (X[1] + R * X[2]) + R * R * (X[3] + R * X[4]);
 141     }
 142   else
 143     {
 144       for (a = 1, z[1] = X[1]; z[1] < TWO23; )
 145         {
 146           a *= 2;
 147           z[1] *= 2;
 148         }
 149
 150       for (i = 2; i < 5; i++)
 151         {
 152           mantissa_store_t d, r;
 153           d = X[i] * (mantissa_store_t) a;
 154           DIV_RADIX (d, r);
 155           z[i] = r;
 156           z[i - 1] += d;
 157         }
 158
 159       u = ALIGN_DOWN_TO (z[3], TWO19);
 160       v = z[3] - u;
 161
 162       if (v == TWO18)
 163         {
 164           if (z[4] == 0)
 165             {
 166               for (i = 5; i <= p; i++)
 167                 {
 168                   if (X[i] == 0)
 169                     continue;
 170                   else
 171                     {
 172                       z[3] += 1;
 173                       break;
 174                     }
 175                 }
 176             }
 177           else
 178             z[3] += 1;
 179         }
 180
 181       c = (z[1] + R * (z[2] + R * z[3])) / a;
 182     }
 183
 184   c *= X[0];
 185
 186   for (i = 1; i < EX; i++)
 187     c *= RADIX;
 188   for (i = 1; i > EX; i--)
 189     c *= RADIXI;
 190
 191   *y = c;
 192 # undef R
 193 }
 194
 195 /* Convert a multiple precision number *X into a double precision
 196    number *Y, Denormal case  (|x| < 2**(-1022))).  */
 197 static void
 198 denorm (const mp_no *x, double *y, int p)
 199 {
 200   long i, k;
 201   long p2 = p;
 202   double c;
 203   mantissa_t u, z[5];
 204
 205 # define R RADIXI
 206   if (EX < -44 || (EX == -44 && X[1] < TWO5))
 207     {
 208       *y = 0;
 209       return;
 210     }
 211
 212   if (p2 == 1)
 213     {
 214       if (EX == -42)
 215         {
 216           z[1] = X[1] + TWO10;
 217           z[2] = 0;
 218           z[3] = 0;
 219           k = 3;
 220         }
 221       else if (EX == -43)
 222         {
 223           z[1] = TWO10;
 224           z[2] = X[1];
 225           z[3] = 0;
 226           k = 2;
 227         }
 228       else
 229         {
 230           z[1] = TWO10;
 231           z[2] = 0;
 232           z[3] = X[1];
 233           k = 1;
 234         }
 235     }
 236   else if (p2 == 2)
 237     {
 238       if (EX == -42)
 239         {
 240           z[1] = X[1] + TWO10;
 241           z[2] = X[2];
 242           z[3] = 0;
 243           k = 3;
 244         }
 245       else if (EX == -43)
 246         {
 247           z[1] = TWO10;
 248           z[2] = X[1];
 249           z[3] = X[2];
 250           k = 2;
 251         }
 252       else
 253         {
 254           z[1] = TWO10;
 255           z[2] = 0;
 256           z[3] = X[1];
 257           k = 1;
 258         }
 259     }
 260   else
 261     {
 262       if (EX == -42)
 263         {
 264           z[1] = X[1] + TWO10;
 265           z[2] = X[2];
 266           k = 3;
 267         }
 268       else if (EX == -43)
 269         {
 270           z[1] = TWO10;
 271           z[2] = X[1];
 272           k = 2;
 273         }
 274       else
 275         {
 276           z[1] = TWO10;
 277           z[2] = 0;
 278           k = 1;
 279         }
 280       z[3] = X[k];
 281     }
 282
 283   u = ALIGN_DOWN_TO (z[3], TWO5);
 284
 285   if (u == z[3])
 286     {
 287       for (i = k + 1; i <= p2; i++)
 288         {
 289           if (X[i] == 0)
 290             continue;
 291           else
 292             {
 293               z[3] += 1;
 294               break;
 295             }
 296         }
 297     }
 298
 299   c = X[0] * ((z[1] + R * (z[2] + R * z[3])) - TWO10);
 300
 301   *y = c * TWOM1032;
 302 # undef R
 303 }
 304
 305 /* Convert multiple precision number *X into double precision number *Y.  The
 306    result is correctly rounded to the nearest/even.  */
 307 void
 308 __mp_dbl (const mp_no *x, double *y, int p)
 309 {
 310   if (X[0] == 0)
 311     {
 312       *y = 0;
 313       return;
 314     }
 315
 316   if (__glibc_likely (EX > -42 || (EX == -42 && X[1] >= TWO10)))
 317     norm (x, y, p);
 318   else
 319     denorm (x, y, p);
 320 }
 321 #endif
 322
 323 /* Get the multiple precision equivalent of X into *Y.  If the precision is too
 324    small, the result is truncated.  */
 325 void
 326 SECTION
 327 __dbl_mp (double x, mp_no *y, int p)
 328 {
 329   long i, n;
 330   long p2 = p;
 331
 332   /* Sign.  */
 333   if (x == 0)
 334     {
 335       Y[0] = 0;
 336       return;
 337     }
 338   else if (x > 0)
 339     Y[0] = 1;
 340   else
 341     {
 342       Y[0] = -1;
 343       x = -x;
 344     }
 345
 346   /* Exponent.  */
 347   for (EY = 1; x >= RADIX; EY += 1)
 348     x *= RADIXI;
 349   for (; x < 1; EY -= 1)
 350     x *= RADIX;
 351
 352   /* Digits.  */
 353   n = MIN (p2, 4);
 354   for (i = 1; i <= n; i++)
 355     {
 356       INTEGER_OF (x, Y[i]);
 357       x *= RADIX;
 358     }
 359   for (; i <= p2; i++)
 360     Y[i] = 0;
 361 }
 362
 363 /* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0.  The
 364    sign of the sum *Z is not changed.  X and Y may overlap but not X and Z or
 365    Y and Z.  No guard digit is used.  The result equals the exact sum,
 366    truncated.  */
 367 static void
 368 SECTION
 369 add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 370 {
 371   long i, j, k;
 372   long p2 = p;
 373   mantissa_t zk;
 374
 375   EZ = EX;
 376
 377   i = p2;
 378   j = p2 + EY - EX;
 379   k = p2 + 1;
 380
 381   if (__glibc_unlikely (j < 1))
 382     {
 383       __cpy (x, z, p);
 384       return;
 385     }
 386
 387   zk = 0;
 388
 389   for (; j > 0; i--, j--)
 390     {
 391       zk += X[i] + Y[j];
 392       if (zk >= RADIX)
 393         {
 394           Z[k--] = zk - RADIX;
 395           zk = 1;
 396         }
 397       else
 398         {
 399           Z[k--] = zk;
 400           zk = 0;
 401         }
 402     }
 403
 404   for (; i > 0; i--)
 405     {
 406       zk += X[i];
 407       if (zk >= RADIX)
 408         {
 409           Z[k--] = zk - RADIX;
 410           zk = 1;
 411         }
 412       else
 413         {
 414           Z[k--] = zk;
 415           zk = 0;
 416         }
 417     }
 418
 419   if (zk == 0)
 420     {
 421       for (i = 1; i <= p2; i++)
 422         Z[i] = Z[i + 1];
 423     }
 424   else
 425     {
 426       Z[1] = zk;
 427       EZ += 1;
 428     }
 429 }
 430
 431 /* Subtract the magnitudes of *X and *Y assuming that abs (*x) > abs (*y) > 0.
 432    The sign of the difference *Z is not changed.  X and Y may overlap but not X
 433    and Z or Y and Z.  One guard digit is used.  The error is less than one
 434    ULP.  */
 435 static void
 436 SECTION
 437 sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 438 {
 439   long i, j, k;
 440   long p2 = p;
 441   mantissa_t zk;
 442
 443   EZ = EX;
 444   i = p2;
 445   j = p2 + EY - EX;
 446   k = p2;
 447
 448   /* Y is too small compared to X, copy X over to the result.  */
 449   if (__glibc_unlikely (j < 1))
 450     {
 451       __cpy (x, z, p);
 452       return;
 453     }
 454
 455   /* The relevant least significant digit in Y is non-zero, so we factor it in
 456      to enhance accuracy.  */
 457   if (j < p2 && Y[j + 1] > 0)
 458     {
 459       Z[k + 1] = RADIX - Y[j + 1];
 460       zk = -1;
 461     }
 462   else
 463     zk = Z[k + 1] = 0;
 464
 465   /* Subtract and borrow.  */
 466   for (; j > 0; i--, j--)
 467     {
 468       zk += (X[i] - Y[j]);
 469       if (zk < 0)
 470         {
 471           Z[k--] = zk + RADIX;
 472           zk = -1;
 473         }
 474       else
 475         {
 476           Z[k--] = zk;
 477           zk = 0;
 478         }
 479     }
 480
 481   /* We're done with digits from Y, so it's just digits in X.  */
 482   for (; i > 0; i--)
 483     {
 484       zk += X[i];
 485       if (zk < 0)
 486         {
 487           Z[k--] = zk + RADIX;
 488           zk = -1;
 489         }
 490       else
 491         {
 492           Z[k--] = zk;
 493           zk = 0;
 494         }
 495     }
 496
 497   /* Normalize.  */
 498   for (i = 1; Z[i] == 0; i++)
 499     ;
 500   EZ = EZ - i + 1;
 501   for (k = 1; i <= p2 + 1; )
 502     Z[k++] = Z[i++];
 503   for (; k <= p2; )
 504     Z[k++] = 0;
 505 }
 506
 507 /* Add *X and *Y and store the result in *Z.  X and Y may overlap, but not X
 508    and Z or Y and Z.  One guard digit is used.  The error is less than one
 509    ULP.  */
 510 void
 511 SECTION
 512 __add (const mp_no *x, const mp_no *y, mp_no *z, int p)
 513 {
 514   int n;
 515
 516   if (X[0] == 0)
 517     {
 518       __cpy (y, z, p);
 519       return;
 520     }
 521   else if (Y[0] == 0)
 522     {
 523       __cpy (x, z, p);
 524       return;
 525     }
 526
 527   if (X[0] == Y[0])
 528     {
 529       if (__acr (x, y, p) > 0)
 530         {
 531           add_magnitudes (x, y, z, p);
 532           Z[0] = X[0];
 533         }
 534       else
 535         {
 536           add_magnitudes (y, x, z, p);
 537           Z[0] = Y[0];
 538         }
 539     }
 540   else
 541     {
 542       if ((n = __acr (x, y, p)) == 1)
 543         {
 544           sub_magnitudes (x, y, z, p);
 545           Z[0] = X[0];
 546         }
 547       else if (n == -1)
 548         {
 549           sub_magnitudes (y, x, z, p);
 550           Z[0] = Y[0];
 551         }
 552       else
 553         Z[0] = 0;
 554     }
 555 }
 556
 557 /* Subtract *Y from *X and return the result in *Z.  X and Y may overlap but
 558    not X and Z or Y and Z.  One guard digit is used.  The error is less than
 559    one ULP.  */
 560 void
 561 SECTION
 562 __sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
 563 {
 564   int n;
 565
 566   if (X[0] == 0)
 567     {
 568       __cpy (y, z, p);
 569       Z[0] = -Z[0];
 570       return;
 571     }
 572   else if (Y[0] == 0)
 573     {
 574       __cpy (x, z, p);
 575       return;
 576     }
 577
 578   if (X[0] != Y[0])
 579     {
 580       if (__acr (x, y, p) > 0)
 581         {
 582           add_magnitudes (x, y, z, p);
 583           Z[0] = X[0];
 584         }
 585       else
 586         {
 587           add_magnitudes (y, x, z, p);
 588           Z[0] = -Y[0];
 589         }
 590     }
 591   else
 592     {
 593       if ((n = __acr (x, y, p)) == 1)
 594         {
 595           sub_magnitudes (x, y, z, p);
 596           Z[0] = X[0];
 597         }
 598       else if (n == -1)
 599         {
 600           sub_magnitudes (y, x, z, p);
 601           Z[0] = -Y[0];
 602         }
 603       else
 604         Z[0] = 0;
 605     }
 606 }
 607
 608 #ifndef NO__MUL
 609 /* Multiply *X and *Y and store result in *Z.  X and Y may overlap but not X
 610    and Z or Y and Z.  For P in [1, 2, 3], the exact result is truncated to P
 611    digits.  In case P > 3 the error is bounded by 1.001 ULP.  */
 612 void
 613 SECTION
 614 __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
 615 {
 616   long i, j, k, ip, ip2;
 617   long p2 = p;
 618   mantissa_store_t zk;
 619   const mp_no *a;
 620   mantissa_store_t *diag;
 621
 622   /* Is z=0?  */
 623   if (__glibc_unlikely (X[0] * Y[0] == 0))
 624     {
 625       Z[0] = 0;
 626       return;
 627     }
 628
 629   /* We need not iterate through all X's and Y's since it's pointless to
 630      multiply zeroes.  Here, both are zero...  */
 631   for (ip2 = p2; ip2 > 0; ip2--)
 632     if (X[ip2] != 0 || Y[ip2] != 0)
 633       break;
 634
 635   a = X[ip2] != 0 ? y : x;
 636
 637   /* ... and here, at least one of them is still zero.  */
 638   for (ip = ip2; ip > 0; ip--)
 639     if (a->d[ip] != 0)
 640       break;
 641
 642   /* The product looks like this for p = 3 (as an example):
 643
 644
 645                                 a1    a2    a3
 646                  x              b1    b2    b3
 647                  -----------------------------
 648                              a1*b3 a2*b3 a3*b3
 649                        a1*b2 a2*b2 a3*b2
 650                  a1*b1 a2*b1 a3*b1
 651
 652      So our K needs to ideally be P*2, but we're limiting ourselves to P + 3
 653      for P >= 3.  We compute the above digits in two parts; the last P-1
 654      digits and then the first P digits.  The last P-1 digits are a sum of
 655      products of the input digits from P to P-k where K is 0 for the least
 656      significant digit and increases as we go towards the left.  The product
 657      term is of the form X[k]*X[P-k] as can be seen in the above example.
 658
 659      The first P digits are also a sum of products with the same product term,
 660      except that the sum is from 1 to k.  This is also evident from the above
 661      example.
 662
 663      Another thing that becomes evident is that only the most significant
 664      ip+ip2 digits of the result are non-zero, where ip and ip2 are the
 665      'internal precision' of the input numbers, i.e. digits after ip and ip2
 666      are all 0.  */
 667
 668   k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
 669
 670   while (k > ip + ip2 + 1)
 671     Z[k--] = 0;
 672
 673   zk = 0;
 674
 675   /* Precompute sums of diagonal elements so that we can directly use them
 676      later.  See the next comment to know we why need them.  */
 677   diag = alloca (k * sizeof (mantissa_store_t));
 678   mantissa_store_t d = 0;
 679   for (i = 1; i <= ip; i++)
 680     {
 681       d += X[i] * (mantissa_store_t) Y[i];
 682       diag[i] = d;
 683     }
 684   while (i < k)
 685     diag[i++] = d;
 686
 687   while (k > p2)
 688     {
 689       long lim = k / 2;
 690
 691       if (k % 2 == 0)
 692         /* We want to add this only once, but since we subtract it in the sum
 693            of products above, we add twice.  */
 694         zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 695
 696       for (i = k - p2, j = p2; i < j; i++, j--)
 697         zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 698
 699       zk -= diag[k - 1];
 700
 701       DIV_RADIX (zk, Z[k]);
 702       k--;
 703     }
 704
 705   /* The real deal.  Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
 706      goes from 1 -> k - 1 and j goes the same range in reverse.  To reduce the
 707      number of multiplications, we halve the range and if k is an even number,
 708      add the diagonal element X[k/2]Y[k/2].  Through the half range, we compute
 709      X[i] * Y[j] as (X[i] + X[j]) * (Y[i] + Y[j]) - X[i] * Y[i] - X[j] * Y[j].
 710
 711      This reduction tells us that we're summing two things, the first term
 712      through the half range and the negative of the sum of the product of all
 713      terms of X and Y in the full range.  i.e.
 714
 715      SUM(X[i] * Y[i]) for k terms.  This is precalculated above for each k in
 716      a single loop so that it completes in O(n) time and can hence be directly
 717      used in the loop below.  */
 718   while (k > 1)
 719     {
 720       long lim = k / 2;
 721
 722       if (k % 2 == 0)
 723         /* We want to add this only once, but since we subtract it in the sum
 724            of products above, we add twice.  */
 725         zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 726
 727       for (i = 1, j = k - 1; i < j; i++, j--)
 728         zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 729
 730       zk -= diag[k - 1];
 731
 732       DIV_RADIX (zk, Z[k]);
 733       k--;
 734     }
 735   Z[k] = zk;
 736
 737   /* Get the exponent sum into an intermediate variable.  This is a subtle
 738      optimization, where given enough registers, all operations on the exponent
 739      happen in registers and the result is written out only once into EZ.  */
 740   int e = EX + EY;
 741
 742   /* Is there a carry beyond the most significant digit?  */
 743   if (__glibc_unlikely (Z[1] == 0))
 744     {
 745       for (i = 1; i <= p2; i++)
 746         Z[i] = Z[i + 1];
 747       e--;
 748     }
 749
 750   EZ = e;
 751   Z[0] = X[0] * Y[0];
 752 }
 753 #endif
 754
 755 #ifndef NO__SQR
 756 /* Square *X and store result in *Y.  X and Y may not overlap.  For P in
 757    [1, 2, 3], the exact result is truncated to P digits.  In case P > 3 the
 758    error is bounded by 1.001 ULP.  This is a faster special case of
 759    multiplication.  */
 760 void
 761 SECTION
 762 __sqr (const mp_no *x, mp_no *y, int p)
 763 {
 764   long i, j, k, ip;
 765   mantissa_store_t yk;
 766
 767   /* Is z=0?  */
 768   if (__glibc_unlikely (X[0] == 0))
 769     {
 770       Y[0] = 0;
 771       return;
 772     }
 773
 774   /* We need not iterate through all X's since it's pointless to
 775      multiply zeroes.  */
 776   for (ip = p; ip > 0; ip--)
 777     if (X[ip] != 0)
 778       break;
 779
 780   k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
 781
 782   while (k > 2 * ip + 1)
 783     Y[k--] = 0;
 784
 785   yk = 0;
 786
 787   while (k > p)
 788     {
 789       mantissa_store_t yk2 = 0;
 790       long lim = k / 2;
 791
 792       if (k % 2 == 0)
 793         yk += X[lim] * (mantissa_store_t) X[lim];
 794
 795       /* In __mul, this loop (and the one within the next while loop) run
 796          between a range to calculate the mantissa as follows:
 797
 798          Z[k] = X[k] * Y[n] + X[k+1] * Y[n-1] ... + X[n-1] * Y[k+1]
 799                 + X[n] * Y[k]
 800
 801          For X == Y, we can get away with summing halfway and doubling the
 802          result.  For cases where the range size is even, the mid-point needs
 803          to be added separately (above).  */
 804       for (i = k - p, j = p; i < j; i++, j--)
 805         yk2 += X[i] * (mantissa_store_t) X[j];
 806
 807       yk += 2 * yk2;
 808
 809       DIV_RADIX (yk, Y[k]);
 810       k--;
 811     }
 812
 813   while (k > 1)
 814     {
 815       mantissa_store_t yk2 = 0;
 816       long lim = k / 2;
 817
 818       if (k % 2 == 0)
 819         yk += X[lim] * (mantissa_store_t) X[lim];
 820
 821       /* Likewise for this loop.  */
 822       for (i = 1, j = k - 1; i < j; i++, j--)
 823         yk2 += X[i] * (mantissa_store_t) X[j];
 824
 825       yk += 2 * yk2;
 826
 827       DIV_RADIX (yk, Y[k]);
 828       k--;
 829     }
 830   Y[k] = yk;
 831
 832   /* Squares are always positive.  */
 833   Y[0] = 1;
 834
 835   /* Get the exponent sum into an intermediate variable.  This is a subtle
 836      optimization, where given enough registers, all operations on the exponent
 837      happen in registers and the result is written out only once into EZ.  */
 838   int e = EX * 2;
 839
 840   /* Is there a carry beyond the most significant digit?  */
 841   if (__glibc_unlikely (Y[1] == 0))
 842     {
 843       for (i = 1; i <= p; i++)
 844         Y[i] = Y[i + 1];
 845       e--;
 846     }
 847
 848   EY = e;
 849 }
 850 #endif
 851
 852 /* Invert *X and store in *Y.  Relative error bound:
 853    - For P = 2: 1.001 * R ^ (1 - P)
 854    - For P = 3: 1.063 * R ^ (1 - P)
 855    - For P > 3: 2.001 * R ^ (1 - P)
 856
 857    *X = 0 is not permissible.  */
 858 static void
 859 SECTION
 860 __inv (const mp_no *x, mp_no *y, int p)
 861 {
 862   long i;
 863   double t;
 864   mp_no z, w;
 865   static const int np1[] =
 866     { 0, 0, 0, 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 867     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
 868   };
 869
 870   __cpy (x, &z, p);
 871   z.e = 0;
 872   __mp_dbl (&z, &t, p);
 873   t = 1 / t;
 874   __dbl_mp (t, y, p);
 875   EY -= EX;
 876
 877   for (i = 0; i < np1[p]; i++)
 878     {
 879       __cpy (y, &w, p);
 880       __mul (x, &w, y, p);
 881       __sub (&__mptwo, y, &z, p);
 882       __mul (&w, &z, y, p);
 883     }
 884 }
 885
 886 /* Divide *X by *Y and store result in *Z.  X and Y may overlap but not X and Z
 887    or Y and Z.  Relative error bound:
 888    - For P = 2: 2.001 * R ^ (1 - P)
 889    - For P = 3: 2.063 * R ^ (1 - P)
 890    - For P > 3: 3.001 * R ^ (1 - P)
 891
 892    *X = 0 is not permissible.  */
 893 void
 894 SECTION
 895 __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
 896 {
 897   mp_no w;
 898
 899   if (X[0] == 0)
 900     Z[0] = 0;
 901   else
 902     {
 903       __inv (y, &w, p);
 904       __mul (x, &w, z, p);
 905     }
 906 }