include/fpu/softfloat-macros.h

   1 /*
   2  * QEMU float support macros
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 #ifndef FPU_SOFTFLOAT_MACROS_H
  83 #define FPU_SOFTFLOAT_MACROS_H
  84
  85 #include "fpu/softfloat-types.h"
  86 #include "qemu/host-utils.h"
  87
  88 /**
  89  * shl_double: double-word merging left shift
  90  * @l: left or most-significant word
  91  * @r: right or least-significant word
  92  * @c: shift count
  93  *
  94  * Shift @l left by @c bits, shifting in bits from @r.
  95  */
  96 static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
  97 {
  98 #if defined(__x86_64__)
  99     asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
 100     return l;
 101 #else
 102     return c ? (l << c) | (r >> (64 - c)) : l;
 103 #endif
 104 }
 105
 106 /**
 107  * shr_double: double-word merging right shift
 108  * @l: left or most-significant word
 109  * @r: right or least-significant word
 110  * @c: shift count
 111  *
 112  * Shift @r right by @c bits, shifting in bits from @l.
 113  */
 114 static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
 115 {
 116 #if defined(__x86_64__)
 117     asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
 118     return r;
 119 #else
 120     return c ? (r >> c) | (l << (64 - c)) : r;
 121 #endif
 122 }
 123
 124 /*----------------------------------------------------------------------------
 125 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
 126 | bits are shifted off, they are ``jammed'' into the least significant bit of
 127 | the result by setting the least significant bit to 1.  The value of `count'
 128 | can be arbitrarily large; in particular, if `count' is greater than 32, the
 129 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 130 | The result is stored in the location pointed to by `zPtr'.
 131 *----------------------------------------------------------------------------*/
 132
 133 static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
 134 {
 135     uint32_t z;
 136
 137     if ( count == 0 ) {
 138         z = a;
 139     }
 140     else if ( count < 32 ) {
 141         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
 142     }
 143     else {
 144         z = ( a != 0 );
 145     }
 146     *zPtr = z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
 152 | bits are shifted off, they are ``jammed'' into the least significant bit of
 153 | the result by setting the least significant bit to 1.  The value of `count'
 154 | can be arbitrarily large; in particular, if `count' is greater than 64, the
 155 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
 156 | The result is stored in the location pointed to by `zPtr'.
 157 *----------------------------------------------------------------------------*/
 158
 159 static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
 160 {
 161     uint64_t z;
 162
 163     if ( count == 0 ) {
 164         z = a;
 165     }
 166     else if ( count < 64 ) {
 167         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
 168     }
 169     else {
 170         z = ( a != 0 );
 171     }
 172     *zPtr = z;
 173
 174 }
 175
 176 /*----------------------------------------------------------------------------
 177 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
 178 | _plus_ the number of bits given in `count'.  The shifted result is at most
 179 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
 180 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
 181 | shifted off is the most-significant bit of the extra result, and the other
 182 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
 183 | bits shifted off were all zero.  This extra result is stored in the location
 184 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
 185 |     (This routine makes more sense if `a0' and `a1' are considered to form a
 186 | fixed-point value with binary point between `a0' and `a1'.  This fixed-point
 187 | value is shifted right by the number of bits given in `count', and the
 188 | integer part of the result is returned at the location pointed to by
 189 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 190 | described above, and is returned at the location pointed to by `z1Ptr'.)
 191 *----------------------------------------------------------------------------*/
 192
 193 static inline void
 194  shift64ExtraRightJamming(
 195      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 196 {
 197     uint64_t z0, z1;
 198     int8_t negCount = ( - count ) & 63;
 199
 200     if ( count == 0 ) {
 201         z1 = a1;
 202         z0 = a0;
 203     }
 204     else if ( count < 64 ) {
 205         z1 = ( a0<<negCount ) | ( a1 != 0 );
 206         z0 = a0>>count;
 207     }
 208     else {
 209         if ( count == 64 ) {
 210             z1 = a0 | ( a1 != 0 );
 211         }
 212         else {
 213             z1 = ( ( a0 | a1 ) != 0 );
 214         }
 215         z0 = 0;
 216     }
 217     *z1Ptr = z1;
 218     *z0Ptr = z0;
 219
 220 }
 221
 222 /*----------------------------------------------------------------------------
 223 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 224 | number of bits given in `count'.  Any bits shifted off are lost.  The value
 225 | of `count' can be arbitrarily large; in particular, if `count' is greater
 226 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
 227 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 228 *----------------------------------------------------------------------------*/
 229
 230 static inline void
 231  shift128Right(
 232      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 233 {
 234     uint64_t z0, z1;
 235     int8_t negCount = ( - count ) & 63;
 236
 237     if ( count == 0 ) {
 238         z1 = a1;
 239         z0 = a0;
 240     }
 241     else if ( count < 64 ) {
 242         z1 = ( a0<<negCount ) | ( a1>>count );
 243         z0 = a0>>count;
 244     }
 245     else {
 246         z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
 247         z0 = 0;
 248     }
 249     *z1Ptr = z1;
 250     *z0Ptr = z0;
 251
 252 }
 253
 254 /*----------------------------------------------------------------------------
 255 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 256 | number of bits given in `count'.  If any nonzero bits are shifted off, they
 257 | are ``jammed'' into the least significant bit of the result by setting the
 258 | least significant bit to 1.  The value of `count' can be arbitrarily large;
 259 | in particular, if `count' is greater than 128, the result will be either
 260 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 261 | nonzero.  The result is broken into two 64-bit pieces which are stored at
 262 | the locations pointed to by `z0Ptr' and `z1Ptr'.
 263 *----------------------------------------------------------------------------*/
 264
 265 static inline void
 266  shift128RightJamming(
 267      uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
 268 {
 269     uint64_t z0, z1;
 270     int8_t negCount = ( - count ) & 63;
 271
 272     if ( count == 0 ) {
 273         z1 = a1;
 274         z0 = a0;
 275     }
 276     else if ( count < 64 ) {
 277         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 278         z0 = a0>>count;
 279     }
 280     else {
 281         if ( count == 64 ) {
 282             z1 = a0 | ( a1 != 0 );
 283         }
 284         else if ( count < 128 ) {
 285             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 286         }
 287         else {
 288             z1 = ( ( a0 | a1 ) != 0 );
 289         }
 290         z0 = 0;
 291     }
 292     *z1Ptr = z1;
 293     *z0Ptr = z0;
 294
 295 }
 296
 297 /*----------------------------------------------------------------------------
 298 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
 299 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
 300 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
 301 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
 302 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
 303 | the most-significant bit of the extra result, and the other 63 bits of the
 304 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
 305 | were all zero.  This extra result is stored in the location pointed to by
 306 | `z2Ptr'.  The value of `count' can be arbitrarily large.
 307 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
 308 | to form a fixed-point value with binary point between `a1' and `a2'.  This
 309 | fixed-point value is shifted right by the number of bits given in `count',
 310 | and the integer part of the result is returned at the locations pointed to
 311 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
 312 | corrupted as described above, and is returned at the location pointed to by
 313 | `z2Ptr'.)
 314 *----------------------------------------------------------------------------*/
 315
 316 static inline void
 317  shift128ExtraRightJamming(
 318      uint64_t a0,
 319      uint64_t a1,
 320      uint64_t a2,
 321      int count,
 322      uint64_t *z0Ptr,
 323      uint64_t *z1Ptr,
 324      uint64_t *z2Ptr
 325  )
 326 {
 327     uint64_t z0, z1, z2;
 328     int8_t negCount = ( - count ) & 63;
 329
 330     if ( count == 0 ) {
 331         z2 = a2;
 332         z1 = a1;
 333         z0 = a0;
 334     }
 335     else {
 336         if ( count < 64 ) {
 337             z2 = a1<<negCount;
 338             z1 = ( a0<<negCount ) | ( a1>>count );
 339             z0 = a0>>count;
 340         }
 341         else {
 342             if ( count == 64 ) {
 343                 z2 = a1;
 344                 z1 = a0;
 345             }
 346             else {
 347                 a2 |= a1;
 348                 if ( count < 128 ) {
 349                     z2 = a0<<negCount;
 350                     z1 = a0>>( count & 63 );
 351                 }
 352                 else {
 353                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
 354                     z1 = 0;
 355                 }
 356             }
 357             z0 = 0;
 358         }
 359         z2 |= ( a2 != 0 );
 360     }
 361     *z2Ptr = z2;
 362     *z1Ptr = z1;
 363     *z0Ptr = z0;
 364
 365 }
 366
 367 /*----------------------------------------------------------------------------
 368 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 369 | number of bits given in `count'.  Any bits shifted off are lost.  The value
 370 | of `count' must be less than 64.  The result is broken into two 64-bit
 371 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 372 *----------------------------------------------------------------------------*/
 373
 374 static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
 375                                      uint64_t *z0Ptr, uint64_t *z1Ptr)
 376 {
 377     *z1Ptr = a1 << count;
 378     *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 379 }
 380
 381 /*----------------------------------------------------------------------------
 382 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 383 | number of bits given in `count'.  Any bits shifted off are lost.  The value
 384 | of `count' may be greater than 64.  The result is broken into two 64-bit
 385 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 386 *----------------------------------------------------------------------------*/
 387
 388 static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
 389                                 uint64_t *z0Ptr, uint64_t *z1Ptr)
 390 {
 391     if (count < 64) {
 392         *z1Ptr = a1 << count;
 393         *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
 394     } else {
 395         *z1Ptr = 0;
 396         *z0Ptr = a1 << (count - 64);
 397     }
 398 }
 399
 400 /*----------------------------------------------------------------------------
 401 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 402 | by the number of bits given in `count'.  Any bits shifted off are lost.
 403 | The value of `count' must be less than 64.  The result is broken into three
 404 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 405 | `z1Ptr', and `z2Ptr'.
 406 *----------------------------------------------------------------------------*/
 407
 408 static inline void
 409  shortShift192Left(
 410      uint64_t a0,
 411      uint64_t a1,
 412      uint64_t a2,
 413      int count,
 414      uint64_t *z0Ptr,
 415      uint64_t *z1Ptr,
 416      uint64_t *z2Ptr
 417  )
 418 {
 419     uint64_t z0, z1, z2;
 420     int8_t negCount;
 421
 422     z2 = a2<<count;
 423     z1 = a1<<count;
 424     z0 = a0<<count;
 425     if ( 0 < count ) {
 426         negCount = ( ( - count ) & 63 );
 427         z1 |= a2>>negCount;
 428         z0 |= a1>>negCount;
 429     }
 430     *z2Ptr = z2;
 431     *z1Ptr = z1;
 432     *z0Ptr = z0;
 433
 434 }
 435
 436 /*----------------------------------------------------------------------------
 437 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 438 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 439 | any carry out is lost.  The result is broken into two 64-bit pieces which
 440 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 441 *----------------------------------------------------------------------------*/
 442
 443 static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
 444                           uint64_t *z0Ptr, uint64_t *z1Ptr)
 445 {
 446     bool c = 0;
 447     *z1Ptr = uadd64_carry(a1, b1, &c);
 448     *z0Ptr = uadd64_carry(a0, b0, &c);
 449 }
 450
 451 /*----------------------------------------------------------------------------
 452 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 453 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 454 | modulo 2^192, so any carry out is lost.  The result is broken into three
 455 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 456 | `z1Ptr', and `z2Ptr'.
 457 *----------------------------------------------------------------------------*/
 458
 459 static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
 460                           uint64_t b0, uint64_t b1, uint64_t b2,
 461                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 462 {
 463     bool c = 0;
 464     *z2Ptr = uadd64_carry(a2, b2, &c);
 465     *z1Ptr = uadd64_carry(a1, b1, &c);
 466     *z0Ptr = uadd64_carry(a0, b0, &c);
 467 }
 468
 469 /*----------------------------------------------------------------------------
 470 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 471 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 472 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 473 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 474 | `z1Ptr'.
 475 *----------------------------------------------------------------------------*/
 476
 477 static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
 478                           uint64_t *z0Ptr, uint64_t *z1Ptr)
 479 {
 480     bool c = 0;
 481     *z1Ptr = usub64_borrow(a1, b1, &c);
 482     *z0Ptr = usub64_borrow(a0, b0, &c);
 483 }
 484
 485 /*----------------------------------------------------------------------------
 486 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 487 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 488 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 489 | result is broken into three 64-bit pieces which are stored at the locations
 490 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 491 *----------------------------------------------------------------------------*/
 492
 493 static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
 494                           uint64_t b0, uint64_t b1, uint64_t b2,
 495                           uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 496 {
 497     bool c = 0;
 498     *z2Ptr = usub64_borrow(a2, b2, &c);
 499     *z1Ptr = usub64_borrow(a1, b1, &c);
 500     *z0Ptr = usub64_borrow(a0, b0, &c);
 501 }
 502
 503 /*----------------------------------------------------------------------------
 504 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 505 | into two 64-bit pieces which are stored at the locations pointed to by
 506 | `z0Ptr' and `z1Ptr'.
 507 *----------------------------------------------------------------------------*/
 508
 509 static inline void
 510 mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
 511 {
 512     mulu64(z1Ptr, z0Ptr, a, b);
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 517 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 518 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 519 | `z2Ptr'.
 520 *----------------------------------------------------------------------------*/
 521
 522 static inline void
 523 mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
 524                 uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
 525 {
 526     uint64_t z0, z1, m1;
 527
 528     mul64To128(a1, b, &m1, z2Ptr);
 529     mul64To128(a0, b, &z0, &z1);
 530     add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
 531 }
 532
 533 /*----------------------------------------------------------------------------
 534 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 535 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 536 | product.  The product is broken into four 64-bit pieces which are stored at
 537 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 538 *----------------------------------------------------------------------------*/
 539
 540 static inline void mul128To256(uint64_t a0, uint64_t a1,
 541                                uint64_t b0, uint64_t b1,
 542                                uint64_t *z0Ptr, uint64_t *z1Ptr,
 543                                uint64_t *z2Ptr, uint64_t *z3Ptr)
 544 {
 545     uint64_t z0, z1, z2;
 546     uint64_t m0, m1, m2, n1, n2;
 547
 548     mul64To128(a1, b0, &m1, &m2);
 549     mul64To128(a0, b1, &n1, &n2);
 550     mul64To128(a1, b1, &z2, z3Ptr);
 551     mul64To128(a0, b0, &z0, &z1);
 552
 553     add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
 554     add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
 555 }
 556
 557 /*----------------------------------------------------------------------------
 558 | Returns an approximation to the 64-bit integer quotient obtained by dividing
 559 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
 560 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
 561 | toward zero, the approximation returned lies between q and q + 2 inclusive.
 562 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
 563 | unsigned integer is returned.
 564 *----------------------------------------------------------------------------*/
 565
 566 static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
 567 {
 568     uint64_t b0, b1;
 569     uint64_t rem0, rem1, term0, term1;
 570     uint64_t z;
 571
 572     if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
 573     b0 = b>>32;
 574     z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
 575     mul64To128( b, z, &term0, &term1 );
 576     sub128( a0, a1, term0, term1, &rem0, &rem1 );
 577     while ( ( (int64_t) rem0 ) < 0 ) {
 578         z -= UINT64_C(0x100000000);
 579         b1 = b<<32;
 580         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
 581     }
 582     rem0 = ( rem0<<32 ) | ( rem1>>32 );
 583     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
 584     return z;
 585
 586 }
 587
 588 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 589  * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 590  *
 591  * Licensed under the GPLv2/LGPLv3
 592  */
 593 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
 594                                   uint64_t n0, uint64_t d)
 595 {
 596 #if defined(__x86_64__)
 597     uint64_t q;
 598     asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
 599     return q;
 600 #elif defined(__s390x__) && !defined(__clang__)
 601     /* Need to use a TImode type to get an even register pair for DLGR.  */
 602     unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
 603     asm("dlgr %0, %1" : "+r"(n) : "r"(d));
 604     *r = n >> 64;
 605     return n;
 606 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
 607     /* From Power ISA 2.06, programming note for divdeu.  */
 608     uint64_t q1, q2, Q, r1, r2, R;
 609     asm("divdeu %0,%2,%4; divdu %1,%3,%4"
 610         : "=&r"(q1), "=r"(q2)
 611         : "r"(n1), "r"(n0), "r"(d));
 612     r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
 613     r2 = n0 - (q2 * d);
 614     Q = q1 + q2;
 615     R = r1 + r2;
 616     if (R >= d || R < r2) { /* overflow implies R > d */
 617         Q += 1;
 618         R -= d;
 619     }
 620     *r = R;
 621     return Q;
 622 #else
 623     uint64_t d0, d1, q0, q1, r1, r0, m;
 624
 625     d0 = (uint32_t)d;
 626     d1 = d >> 32;
 627
 628     r1 = n1 % d1;
 629     q1 = n1 / d1;
 630     m = q1 * d0;
 631     r1 = (r1 << 32) | (n0 >> 32);
 632     if (r1 < m) {
 633         q1 -= 1;
 634         r1 += d;
 635         if (r1 >= d) {
 636             if (r1 < m) {
 637                 q1 -= 1;
 638                 r1 += d;
 639             }
 640         }
 641     }
 642     r1 -= m;
 643
 644     r0 = r1 % d1;
 645     q0 = r1 / d1;
 646     m = q0 * d0;
 647     r0 = (r0 << 32) | (uint32_t)n0;
 648     if (r0 < m) {
 649         q0 -= 1;
 650         r0 += d;
 651         if (r0 >= d) {
 652             if (r0 < m) {
 653                 q0 -= 1;
 654                 r0 += d;
 655             }
 656         }
 657     }
 658     r0 -= m;
 659
 660     *r = r0;
 661     return (q1 << 32) | q0;
 662 #endif
 663 }
 664
 665 /*----------------------------------------------------------------------------
 666 | Returns an approximation to the square root of the 32-bit significand given
 667 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 668 | `aExp' (the least significant bit) is 1, the integer returned approximates
 669 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 670 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 671 | case, the approximation returned lies strictly within +/-2 of the exact
 672 | value.
 673 *----------------------------------------------------------------------------*/
 674
 675 static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
 676 {
 677     static const uint16_t sqrtOddAdjustments[] = {
 678         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 679         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 680     };
 681     static const uint16_t sqrtEvenAdjustments[] = {
 682         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 683         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 684     };
 685     int8_t index;
 686     uint32_t z;
 687
 688     index = ( a>>27 ) & 15;
 689     if ( aExp & 1 ) {
 690         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 691         z = ( ( a / z )<<14 ) + ( z<<15 );
 692         a >>= 1;
 693     }
 694     else {
 695         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 696         z = a / z + z;
 697         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 698         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 699     }
 700     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 701
 702 }
 703
 704 /*----------------------------------------------------------------------------
 705 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 706 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 707 | Otherwise, returns 0.
 708 *----------------------------------------------------------------------------*/
 709
 710 static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 711 {
 712     return a0 == b0 && a1 == b1;
 713 }
 714
 715 /*----------------------------------------------------------------------------
 716 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 717 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 718 | Otherwise, returns 0.
 719 *----------------------------------------------------------------------------*/
 720
 721 static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 722 {
 723     return a0 < b0 || (a0 == b0 && a1 <= b1);
 724 }
 725
 726 /*----------------------------------------------------------------------------
 727 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 728 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 729 | returns 0.
 730 *----------------------------------------------------------------------------*/
 731
 732 static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 733 {
 734     return a0 < b0 || (a0 == b0 && a1 < b1);
 735 }
 736
 737 /*----------------------------------------------------------------------------
 738 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 739 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 740 | Otherwise, returns 0.
 741 *----------------------------------------------------------------------------*/
 742
 743 static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
 744 {
 745     return a0 != b0 || a1 != b1;
 746 }
 747
 748 /*
 749  * Similarly, comparisons of 192-bit values.
 750  */
 751
 752 static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
 753                          uint64_t b0, uint64_t b1, uint64_t b2)
 754 {
 755     return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
 756 }
 757
 758 static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
 759                          uint64_t b0, uint64_t b1, uint64_t b2)
 760 {
 761     if (a0 != b0) {
 762         return a0 < b0;
 763     }
 764     if (a1 != b1) {
 765         return a1 < b1;
 766     }
 767     return a2 <= b2;
 768 }
 769
 770 static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
 771                          uint64_t b0, uint64_t b1, uint64_t b2)
 772 {
 773     if (a0 != b0) {
 774         return a0 < b0;
 775     }
 776     if (a1 != b1) {
 777         return a1 < b1;
 778     }
 779     return a2 < b2;
 780 }
 781
 782 #endif