fpu/softfloat-macros.h

   1 /*
   2  * QEMU float support macros
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  10 Arithmetic Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /*----------------------------------------------------------------------------
  39 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
  40 | bits are shifted off, they are ``jammed'' into the least significant bit of
  41 | the result by setting the least significant bit to 1.  The value of `count'
  42 | can be arbitrarily large; in particular, if `count' is greater than 32, the
  43 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  44 | The result is stored in the location pointed to by `zPtr'.
  45 *----------------------------------------------------------------------------*/
  46
  47 INLINE void shift32RightJamming( uint32_t a, int16 count, uint32_t *zPtr )
  48 {
  49     uint32_t z;
  50
  51     if ( count == 0 ) {
  52         z = a;
  53     }
  54     else if ( count < 32 ) {
  55         z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
  56     }
  57     else {
  58         z = ( a != 0 );
  59     }
  60     *zPtr = z;
  61
  62 }
  63
  64 /*----------------------------------------------------------------------------
  65 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
  66 | bits are shifted off, they are ``jammed'' into the least significant bit of
  67 | the result by setting the least significant bit to 1.  The value of `count'
  68 | can be arbitrarily large; in particular, if `count' is greater than 64, the
  69 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  70 | The result is stored in the location pointed to by `zPtr'.
  71 *----------------------------------------------------------------------------*/
  72
  73 INLINE void shift64RightJamming( uint64_t a, int16 count, uint64_t *zPtr )
  74 {
  75     uint64_t z;
  76
  77     if ( count == 0 ) {
  78         z = a;
  79     }
  80     else if ( count < 64 ) {
  81         z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
  82     }
  83     else {
  84         z = ( a != 0 );
  85     }
  86     *zPtr = z;
  87
  88 }
  89
  90 /*----------------------------------------------------------------------------
  91 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  92 | _plus_ the number of bits given in `count'.  The shifted result is at most
  93 | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
  94 | bits shifted off form a second 64-bit result as follows:  The _last_ bit
  95 | shifted off is the most-significant bit of the extra result, and the other
  96 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  97 | bits shifted off were all zero.  This extra result is stored in the location
  98 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
  99 |     (This routine makes more sense if `a0' and `a1' are considered to form
 100 | a fixed-point value with binary point between `a0' and `a1'.  This fixed-
 101 | point value is shifted right by the number of bits given in `count', and
 102 | the integer part of the result is returned at the location pointed to by
 103 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 104 | described above, and is returned at the location pointed to by `z1Ptr'.)
 105 *----------------------------------------------------------------------------*/
 106
 107 INLINE void
 108  shift64ExtraRightJamming(
 109      uint64_t a0, uint64_t a1, int16 count, uint64_t *z0Ptr, uint64_t *z1Ptr )
 110 {
 111     uint64_t z0, z1;
 112     int8 negCount = ( - count ) & 63;
 113
 114     if ( count == 0 ) {
 115         z1 = a1;
 116         z0 = a0;
 117     }
 118     else if ( count < 64 ) {
 119         z1 = ( a0<<negCount ) | ( a1 != 0 );
 120         z0 = a0>>count;
 121     }
 122     else {
 123         if ( count == 64 ) {
 124             z1 = a0 | ( a1 != 0 );
 125         }
 126         else {
 127             z1 = ( ( a0 | a1 ) != 0 );
 128         }
 129         z0 = 0;
 130     }
 131     *z1Ptr = z1;
 132     *z0Ptr = z0;
 133
 134 }
 135
 136 /*----------------------------------------------------------------------------
 137 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 138 | number of bits given in `count'.  Any bits shifted off are lost.  The value
 139 | of `count' can be arbitrarily large; in particular, if `count' is greater
 140 | than 128, the result will be 0.  The result is broken into two 64-bit pieces
 141 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 142 *----------------------------------------------------------------------------*/
 143
 144 INLINE void
 145  shift128Right(
 146      uint64_t a0, uint64_t a1, int16 count, uint64_t *z0Ptr, uint64_t *z1Ptr )
 147 {
 148     uint64_t z0, z1;
 149     int8 negCount = ( - count ) & 63;
 150
 151     if ( count == 0 ) {
 152         z1 = a1;
 153         z0 = a0;
 154     }
 155     else if ( count < 64 ) {
 156         z1 = ( a0<<negCount ) | ( a1>>count );
 157         z0 = a0>>count;
 158     }
 159     else {
 160         z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
 161         z0 = 0;
 162     }
 163     *z1Ptr = z1;
 164     *z0Ptr = z0;
 165
 166 }
 167
 168 /*----------------------------------------------------------------------------
 169 | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
 170 | number of bits given in `count'.  If any nonzero bits are shifted off, they
 171 | are ``jammed'' into the least significant bit of the result by setting the
 172 | least significant bit to 1.  The value of `count' can be arbitrarily large;
 173 | in particular, if `count' is greater than 128, the result will be either
 174 | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
 175 | nonzero.  The result is broken into two 64-bit pieces which are stored at
 176 | the locations pointed to by `z0Ptr' and `z1Ptr'.
 177 *----------------------------------------------------------------------------*/
 178
 179 INLINE void
 180  shift128RightJamming(
 181      uint64_t a0, uint64_t a1, int16 count, uint64_t *z0Ptr, uint64_t *z1Ptr )
 182 {
 183     uint64_t z0, z1;
 184     int8 negCount = ( - count ) & 63;
 185
 186     if ( count == 0 ) {
 187         z1 = a1;
 188         z0 = a0;
 189     }
 190     else if ( count < 64 ) {
 191         z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
 192         z0 = a0>>count;
 193     }
 194     else {
 195         if ( count == 64 ) {
 196             z1 = a0 | ( a1 != 0 );
 197         }
 198         else if ( count < 128 ) {
 199             z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
 200         }
 201         else {
 202             z1 = ( ( a0 | a1 ) != 0 );
 203         }
 204         z0 = 0;
 205     }
 206     *z1Ptr = z1;
 207     *z0Ptr = z0;
 208
 209 }
 210
 211 /*----------------------------------------------------------------------------
 212 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
 213 | by 64 _plus_ the number of bits given in `count'.  The shifted result is
 214 | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
 215 | stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
 216 | off form a third 64-bit result as follows:  The _last_ bit shifted off is
 217 | the most-significant bit of the extra result, and the other 63 bits of the
 218 | extra result are all zero if and only if _all_but_the_last_ bits shifted off
 219 | were all zero.  This extra result is stored in the location pointed to by
 220 | `z2Ptr'.  The value of `count' can be arbitrarily large.
 221 |     (This routine makes more sense if `a0', `a1', and `a2' are considered
 222 | to form a fixed-point value with binary point between `a1' and `a2'.  This
 223 | fixed-point value is shifted right by the number of bits given in `count',
 224 | and the integer part of the result is returned at the locations pointed to
 225 | by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
 226 | corrupted as described above, and is returned at the location pointed to by
 227 | `z2Ptr'.)
 228 *----------------------------------------------------------------------------*/
 229
 230 INLINE void
 231  shift128ExtraRightJamming(
 232      uint64_t a0,
 233      uint64_t a1,
 234      uint64_t a2,
 235      int16 count,
 236      uint64_t *z0Ptr,
 237      uint64_t *z1Ptr,
 238      uint64_t *z2Ptr
 239  )
 240 {
 241     uint64_t z0, z1, z2;
 242     int8 negCount = ( - count ) & 63;
 243
 244     if ( count == 0 ) {
 245         z2 = a2;
 246         z1 = a1;
 247         z0 = a0;
 248     }
 249     else {
 250         if ( count < 64 ) {
 251             z2 = a1<<negCount;
 252             z1 = ( a0<<negCount ) | ( a1>>count );
 253             z0 = a0>>count;
 254         }
 255         else {
 256             if ( count == 64 ) {
 257                 z2 = a1;
 258                 z1 = a0;
 259             }
 260             else {
 261                 a2 |= a1;
 262                 if ( count < 128 ) {
 263                     z2 = a0<<negCount;
 264                     z1 = a0>>( count & 63 );
 265                 }
 266                 else {
 267                     z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
 268                     z1 = 0;
 269                 }
 270             }
 271             z0 = 0;
 272         }
 273         z2 |= ( a2 != 0 );
 274     }
 275     *z2Ptr = z2;
 276     *z1Ptr = z1;
 277     *z0Ptr = z0;
 278
 279 }
 280
 281 /*----------------------------------------------------------------------------
 282 | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
 283 | number of bits given in `count'.  Any bits shifted off are lost.  The value
 284 | of `count' must be less than 64.  The result is broken into two 64-bit
 285 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 286 *----------------------------------------------------------------------------*/
 287
 288 INLINE void
 289  shortShift128Left(
 290      uint64_t a0, uint64_t a1, int16 count, uint64_t *z0Ptr, uint64_t *z1Ptr )
 291 {
 292
 293     *z1Ptr = a1<<count;
 294     *z0Ptr =
 295         ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) );
 296
 297 }
 298
 299 /*----------------------------------------------------------------------------
 300 | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
 301 | by the number of bits given in `count'.  Any bits shifted off are lost.
 302 | The value of `count' must be less than 64.  The result is broken into three
 303 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 304 | `z1Ptr', and `z2Ptr'.
 305 *----------------------------------------------------------------------------*/
 306
 307 INLINE void
 308  shortShift192Left(
 309      uint64_t a0,
 310      uint64_t a1,
 311      uint64_t a2,
 312      int16 count,
 313      uint64_t *z0Ptr,
 314      uint64_t *z1Ptr,
 315      uint64_t *z2Ptr
 316  )
 317 {
 318     uint64_t z0, z1, z2;
 319     int8 negCount;
 320
 321     z2 = a2<<count;
 322     z1 = a1<<count;
 323     z0 = a0<<count;
 324     if ( 0 < count ) {
 325         negCount = ( ( - count ) & 63 );
 326         z1 |= a2>>negCount;
 327         z0 |= a1>>negCount;
 328     }
 329     *z2Ptr = z2;
 330     *z1Ptr = z1;
 331     *z0Ptr = z0;
 332
 333 }
 334
 335 /*----------------------------------------------------------------------------
 336 | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
 337 | value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
 338 | any carry out is lost.  The result is broken into two 64-bit pieces which
 339 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 340 *----------------------------------------------------------------------------*/
 341
 342 INLINE void
 343  add128(
 344      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 345 {
 346     uint64_t z1;
 347
 348     z1 = a1 + b1;
 349     *z1Ptr = z1;
 350     *z0Ptr = a0 + b0 + ( z1 < a1 );
 351
 352 }
 353
 354 /*----------------------------------------------------------------------------
 355 | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
 356 | 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
 357 | modulo 2^192, so any carry out is lost.  The result is broken into three
 358 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
 359 | `z1Ptr', and `z2Ptr'.
 360 *----------------------------------------------------------------------------*/
 361
 362 INLINE void
 363  add192(
 364      uint64_t a0,
 365      uint64_t a1,
 366      uint64_t a2,
 367      uint64_t b0,
 368      uint64_t b1,
 369      uint64_t b2,
 370      uint64_t *z0Ptr,
 371      uint64_t *z1Ptr,
 372      uint64_t *z2Ptr
 373  )
 374 {
 375     uint64_t z0, z1, z2;
 376     int8 carry0, carry1;
 377
 378     z2 = a2 + b2;
 379     carry1 = ( z2 < a2 );
 380     z1 = a1 + b1;
 381     carry0 = ( z1 < a1 );
 382     z0 = a0 + b0;
 383     z1 += carry1;
 384     z0 += ( z1 < carry1 );
 385     z0 += carry0;
 386     *z2Ptr = z2;
 387     *z1Ptr = z1;
 388     *z0Ptr = z0;
 389
 390 }
 391
 392 /*----------------------------------------------------------------------------
 393 | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
 394 | 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
 395 | 2^128, so any borrow out (carry out) is lost.  The result is broken into two
 396 | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
 397 | `z1Ptr'.
 398 *----------------------------------------------------------------------------*/
 399
 400 INLINE void
 401  sub128(
 402      uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1, uint64_t *z0Ptr, uint64_t *z1Ptr )
 403 {
 404
 405     *z1Ptr = a1 - b1;
 406     *z0Ptr = a0 - b0 - ( a1 < b1 );
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
 412 | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
 413 | Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
 414 | result is broken into three 64-bit pieces which are stored at the locations
 415 | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
 416 *----------------------------------------------------------------------------*/
 417
 418 INLINE void
 419  sub192(
 420      uint64_t a0,
 421      uint64_t a1,
 422      uint64_t a2,
 423      uint64_t b0,
 424      uint64_t b1,
 425      uint64_t b2,
 426      uint64_t *z0Ptr,
 427      uint64_t *z1Ptr,
 428      uint64_t *z2Ptr
 429  )
 430 {
 431     uint64_t z0, z1, z2;
 432     int8 borrow0, borrow1;
 433
 434     z2 = a2 - b2;
 435     borrow1 = ( a2 < b2 );
 436     z1 = a1 - b1;
 437     borrow0 = ( a1 < b1 );
 438     z0 = a0 - b0;
 439     z0 -= ( z1 < borrow1 );
 440     z1 -= borrow1;
 441     z0 -= borrow0;
 442     *z2Ptr = z2;
 443     *z1Ptr = z1;
 444     *z0Ptr = z0;
 445
 446 }
 447
 448 /*----------------------------------------------------------------------------
 449 | Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
 450 | into two 64-bit pieces which are stored at the locations pointed to by
 451 | `z0Ptr' and `z1Ptr'.
 452 *----------------------------------------------------------------------------*/
 453
 454 INLINE void mul64To128( uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr )
 455 {
 456     uint32_t aHigh, aLow, bHigh, bLow;
 457     uint64_t z0, zMiddleA, zMiddleB, z1;
 458
 459     aLow = a;
 460     aHigh = a>>32;
 461     bLow = b;
 462     bHigh = b>>32;
 463     z1 = ( (uint64_t) aLow ) * bLow;
 464     zMiddleA = ( (uint64_t) aLow ) * bHigh;
 465     zMiddleB = ( (uint64_t) aHigh ) * bLow;
 466     z0 = ( (uint64_t) aHigh ) * bHigh;
 467     zMiddleA += zMiddleB;
 468     z0 += ( ( (uint64_t) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
 469     zMiddleA <<= 32;
 470     z1 += zMiddleA;
 471     z0 += ( z1 < zMiddleA );
 472     *z1Ptr = z1;
 473     *z0Ptr = z0;
 474
 475 }
 476
 477 /*----------------------------------------------------------------------------
 478 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
 479 | `b' to obtain a 192-bit product.  The product is broken into three 64-bit
 480 | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
 481 | `z2Ptr'.
 482 *----------------------------------------------------------------------------*/
 483
 484 INLINE void
 485  mul128By64To192(
 486      uint64_t a0,
 487      uint64_t a1,
 488      uint64_t b,
 489      uint64_t *z0Ptr,
 490      uint64_t *z1Ptr,
 491      uint64_t *z2Ptr
 492  )
 493 {
 494     uint64_t z0, z1, z2, more1;
 495
 496     mul64To128( a1, b, &z1, &z2 );
 497     mul64To128( a0, b, &z0, &more1 );
 498     add128( z0, more1, 0, z1, &z0, &z1 );
 499     *z2Ptr = z2;
 500     *z1Ptr = z1;
 501     *z0Ptr = z0;
 502
 503 }
 504
 505 /*----------------------------------------------------------------------------
 506 | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
 507 | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
 508 | product.  The product is broken into four 64-bit pieces which are stored at
 509 | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
 510 *----------------------------------------------------------------------------*/
 511
 512 INLINE void
 513  mul128To256(
 514      uint64_t a0,
 515      uint64_t a1,
 516      uint64_t b0,
 517      uint64_t b1,
 518      uint64_t *z0Ptr,
 519      uint64_t *z1Ptr,
 520      uint64_t *z2Ptr,
 521      uint64_t *z3Ptr
 522  )
 523 {
 524     uint64_t z0, z1, z2, z3;
 525     uint64_t more1, more2;
 526
 527     mul64To128( a1, b1, &z2, &z3 );
 528     mul64To128( a1, b0, &z1, &more2 );
 529     add128( z1, more2, 0, z2, &z1, &z2 );
 530     mul64To128( a0, b0, &z0, &more1 );
 531     add128( z0, more1, 0, z1, &z0, &z1 );
 532     mul64To128( a0, b1, &more1, &more2 );
 533     add128( more1, more2, 0, z2, &more1, &z2 );
 534     add128( z0, z1, 0, more1, &z0, &z1 );
 535     *z3Ptr = z3;
 536     *z2Ptr = z2;
 537     *z1Ptr = z1;
 538     *z0Ptr = z0;
 539
 540 }
 541
 542 /*----------------------------------------------------------------------------
 543 | Returns an approximation to the 64-bit integer quotient obtained by dividing
 544 | `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
 545 | divisor `b' must be at least 2^63.  If q is the exact quotient truncated
 546 | toward zero, the approximation returned lies between q and q + 2 inclusive.
 547 | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
 548 | unsigned integer is returned.
 549 *----------------------------------------------------------------------------*/
 550
 551 static uint64_t estimateDiv128To64( uint64_t a0, uint64_t a1, uint64_t b )
 552 {
 553     uint64_t b0, b1;
 554     uint64_t rem0, rem1, term0, term1;
 555     uint64_t z;
 556
 557     if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
 558     b0 = b>>32;
 559     z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
 560     mul64To128( b, z, &term0, &term1 );
 561     sub128( a0, a1, term0, term1, &rem0, &rem1 );
 562     while ( ( (int64_t) rem0 ) < 0 ) {
 563         z -= LIT64( 0x100000000 );
 564         b1 = b<<32;
 565         add128( rem0, rem1, b0, b1, &rem0, &rem1 );
 566     }
 567     rem0 = ( rem0<<32 ) | ( rem1>>32 );
 568     z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
 569     return z;
 570
 571 }
 572
 573 /*----------------------------------------------------------------------------
 574 | Returns an approximation to the square root of the 32-bit significand given
 575 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
 576 | `aExp' (the least significant bit) is 1, the integer returned approximates
 577 | 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
 578 | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
 579 | case, the approximation returned lies strictly within +/-2 of the exact
 580 | value.
 581 *----------------------------------------------------------------------------*/
 582
 583 static uint32_t estimateSqrt32( int16 aExp, uint32_t a )
 584 {
 585     static const uint16_t sqrtOddAdjustments[] = {
 586         0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
 587         0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
 588     };
 589     static const uint16_t sqrtEvenAdjustments[] = {
 590         0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
 591         0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
 592     };
 593     int8 index;
 594     uint32_t z;
 595
 596     index = ( a>>27 ) & 15;
 597     if ( aExp & 1 ) {
 598         z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
 599         z = ( ( a / z )<<14 ) + ( z<<15 );
 600         a >>= 1;
 601     }
 602     else {
 603         z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
 604         z = a / z + z;
 605         z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
 606         if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
 607     }
 608     return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
 609
 610 }
 611
 612 /*----------------------------------------------------------------------------
 613 | Returns the number of leading 0 bits before the most-significant 1 bit of
 614 | `a'.  If `a' is zero, 32 is returned.
 615 *----------------------------------------------------------------------------*/
 616
 617 static int8 countLeadingZeros32( uint32_t a )
 618 {
 619     static const int8 countLeadingZerosHigh[] = {
 620         8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
 621         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 622         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 623         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 624         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 625         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 626         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 627         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 628         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 629         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 630         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 631         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 632         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 633         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 634         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 635         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 636     };
 637     int8 shiftCount;
 638
 639     shiftCount = 0;
 640     if ( a < 0x10000 ) {
 641         shiftCount += 16;
 642         a <<= 16;
 643     }
 644     if ( a < 0x1000000 ) {
 645         shiftCount += 8;
 646         a <<= 8;
 647     }
 648     shiftCount += countLeadingZerosHigh[ a>>24 ];
 649     return shiftCount;
 650
 651 }
 652
 653 /*----------------------------------------------------------------------------
 654 | Returns the number of leading 0 bits before the most-significant 1 bit of
 655 | `a'.  If `a' is zero, 64 is returned.
 656 *----------------------------------------------------------------------------*/
 657
 658 static int8 countLeadingZeros64( uint64_t a )
 659 {
 660     int8 shiftCount;
 661
 662     shiftCount = 0;
 663     if ( a < ( (uint64_t) 1 )<<32 ) {
 664         shiftCount += 32;
 665     }
 666     else {
 667         a >>= 32;
 668     }
 669     shiftCount += countLeadingZeros32( a );
 670     return shiftCount;
 671
 672 }
 673
 674 /*----------------------------------------------------------------------------
 675 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
 676 | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
 677 | Otherwise, returns 0.
 678 *----------------------------------------------------------------------------*/
 679
 680 INLINE flag eq128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 681 {
 682
 683     return ( a0 == b0 ) && ( a1 == b1 );
 684
 685 }
 686
 687 /*----------------------------------------------------------------------------
 688 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 689 | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
 690 | Otherwise, returns 0.
 691 *----------------------------------------------------------------------------*/
 692
 693 INLINE flag le128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 694 {
 695
 696     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
 697
 698 }
 699
 700 /*----------------------------------------------------------------------------
 701 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
 702 | than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
 703 | returns 0.
 704 *----------------------------------------------------------------------------*/
 705
 706 INLINE flag lt128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 707 {
 708
 709     return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
 710
 711 }
 712
 713 /*----------------------------------------------------------------------------
 714 | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
 715 | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
 716 | Otherwise, returns 0.
 717 *----------------------------------------------------------------------------*/
 718
 719 INLINE flag ne128( uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1 )
 720 {
 721
 722     return ( a0 != b0 ) || ( a1 != b1 );
 723
 724 }