source/libs/gmp/gmp-src/mpn/x86/k7/mmx/divrem_1.asm

   1 dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
   2 dnl  division.
   3
   4 dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of either:
  10 dnl
  11 dnl    * the GNU Lesser General Public License as published by the Free
  12 dnl      Software Foundation; either version 3 of the License, or (at your
  13 dnl      option) any later version.
  14 dnl
  15 dnl  or
  16 dnl
  17 dnl    * the GNU General Public License as published by the Free Software
  18 dnl      Foundation; either version 2 of the License, or (at your option) any
  19 dnl      later version.
  20 dnl
  21 dnl  or both in parallel, as here.
  22 dnl
  23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  26 dnl  for more details.
  27 dnl
  28 dnl  You should have received copies of the GNU General Public License and the
  29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  30 dnl  see https://www.gnu.org/licenses/.
  31
  32 include(`../config.m4')
  33
  34
  35 C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
  36
  37
  38 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  39 C                         mp_srcptr src, mp_size_t size,
  40 C                         mp_limb_t divisor);
  41 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  42 C                          mp_srcptr src, mp_size_t size,
  43 C                          mp_limb_t divisor, mp_limb_t carry);
  44 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
  45 C                                mp_srcptr src, mp_size_t size,
  46 C                                mp_limb_t divisor, mp_limb_t inverse,
  47 C                                unsigned shift);
  48 C
  49 C Algorithm:
  50 C
  51 C The method and nomenclature follow part 8 of "Division by Invariant
  52 C Integers using Multiplication" by Granlund and Montgomery, reference in
  53 C gmp.texi.
  54 C
  55 C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
  56 C for m', and "d" for d_norm, which won't cause any confusion since it's
  57 C only the normalized divisor that's of any use in the code.  "b" is written
  58 C for 2^N, the size of a limb, N being 32 here.
  59 C
  60 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
  61 C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
  62 C q1==0xFFFFFFFF, then q1+1 would overflow.  We branch to a special case
  63 C "q1_ff" if this occurs.  Since the true quotient is either q1 or q1+1 then
  64 C if q1==0xFFFFFFFF that must be the right value.
  65 C
  66 C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
  67 C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1.  This
  68 C then goes through as normal, and finding no addback required.  sbbl costs
  69 C an extra cycle over what the main loop code does, but it keeps code size
  70 C and complexity down.
  71 C
  72 C Notes:
  73 C
  74 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
  75 C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
  76 C carry, since in normal circumstances that will be a very rare event.
  77 C
  78 C The test for skipping a division is branch free (once size>=1 is tested).
  79 C The store to the destination high limb is 0 when a divide is skipped, or
  80 C if it's not skipped then a copy of the src high limb is used.  The latter
  81 C is in case src==dst.
  82 C
  83 C There's a small bias towards expecting xsize==0, by having code for
  84 C xsize==0 in a straight line and xsize!=0 under forward jumps.
  85 C
  86 C Alternatives:
  87 C
  88 C If the divisor is normalized (high bit set) then a division step can
  89 C always be skipped, since the high destination limb is always 0 or 1 in
  90 C that case.  It doesn't seem worth checking for this though, since it
  91 C probably occurs infrequently, in particular note that big_base for a
  92 C decimal mpn_get_str is not normalized in a 32-bit limb.
  93
  94
  95 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
  96 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
  97 dnl
  98 dnl  The inverse takes about 50 cycles to calculate, but after that the
  99 dnl  multiply is 17 c/l versus division at 42 c/l.
 100 dnl
 101 dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
 102 dnl  even more so on the fractional part.
 103
 104 deflit(MUL_THRESHOLD, 3)
 105
 106
 107 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
 108 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
 109 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
 110 defframe(PARAM_DIVISOR,20)
 111 defframe(PARAM_SIZE,   16)
 112 defframe(PARAM_SRC,    12)
 113 defframe(PARAM_XSIZE,  8)
 114 defframe(PARAM_DST,    4)
 115
 116 defframe(SAVE_EBX,    -4)
 117 defframe(SAVE_ESI,    -8)
 118 defframe(SAVE_EDI,    -12)
 119 defframe(SAVE_EBP,    -16)
 120
 121 defframe(VAR_NORM,    -20)
 122 defframe(VAR_INVERSE, -24)
 123 defframe(VAR_SRC,     -28)
 124 defframe(VAR_DST,     -32)
 125 defframe(VAR_DST_STOP,-36)
 126
 127 deflit(STACK_SPACE, 36)
 128
 129         TEXT
 130         ALIGN(32)
 131
 132 PROLOGUE(mpn_preinv_divrem_1)
 133 deflit(`FRAME',0)
 134         movl    PARAM_XSIZE, %ecx
 135         movl    PARAM_DST, %edx
 136         subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)
 137
 138         movl    %esi, SAVE_ESI
 139         movl    PARAM_SRC, %esi
 140
 141         movl    %ebx, SAVE_EBX
 142         movl    PARAM_SIZE, %ebx
 143
 144         leal    8(%edx,%ecx,4), %edx    C &dst[xsize+2]
 145         movl    %ebp, SAVE_EBP
 146         movl    PARAM_DIVISOR, %ebp
 147
 148         movl    %edx, VAR_DST_STOP      C &dst[xsize+2]
 149         movl    %edi, SAVE_EDI
 150         xorl    %edi, %edi              C carry
 151
 152         movl    -4(%esi,%ebx,4), %eax   C src high limb
 153         xor     %ecx, %ecx
 154
 155         C
 156
 157         C
 158
 159         cmpl    %ebp, %eax              C high cmp divisor
 160
 161         cmovc(  %eax, %edi)             C high is carry if high<divisor
 162         cmovnc( %eax, %ecx)             C 0 if skip div, src high if not
 163                                         C (the latter in case src==dst)
 164
 165         movl    %ecx, -12(%edx,%ebx,4)  C dst high limb
 166         sbbl    $0, %ebx                C skip one division if high<divisor
 167         movl    PARAM_PREINV_SHIFT, %ecx
 168
 169         leal    -8(%edx,%ebx,4), %edx   C &dst[xsize+size]
 170         movl    $32, %eax
 171
 172         movl    %edx, VAR_DST           C &dst[xsize+size]
 173
 174         shll    %cl, %ebp               C d normalized
 175         subl    %ecx, %eax
 176         movl    %ecx, VAR_NORM
 177
 178         movd    %eax, %mm7              C rshift
 179         movl    PARAM_PREINV_INVERSE, %eax
 180         jmp     L(start_preinv)
 181
 182 EPILOGUE()
 183
 184
 185         ALIGN(16)
 186
 187 PROLOGUE(mpn_divrem_1c)
 188 deflit(`FRAME',0)
 189         movl    PARAM_CARRY, %edx
 190         movl    PARAM_SIZE, %ecx
 191         subl    $STACK_SPACE, %esp
 192 deflit(`FRAME',STACK_SPACE)
 193
 194         movl    %ebx, SAVE_EBX
 195         movl    PARAM_XSIZE, %ebx
 196
 197         movl    %edi, SAVE_EDI
 198         movl    PARAM_DST, %edi
 199
 200         movl    %ebp, SAVE_EBP
 201         movl    PARAM_DIVISOR, %ebp
 202
 203         movl    %esi, SAVE_ESI
 204         movl    PARAM_SRC, %esi
 205
 206         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 207         jmp     L(start_1c)
 208
 209 EPILOGUE()
 210
 211
 212         C offset 0xa1, close enough to aligned
 213 PROLOGUE(mpn_divrem_1)
 214 deflit(`FRAME',0)
 215
 216         movl    PARAM_SIZE, %ecx
 217         movl    $0, %edx                C initial carry (if can't skip a div)
 218         subl    $STACK_SPACE, %esp
 219 deflit(`FRAME',STACK_SPACE)
 220
 221         movl    %esi, SAVE_ESI
 222         movl    PARAM_SRC, %esi
 223
 224         movl    %ebx, SAVE_EBX
 225         movl    PARAM_XSIZE, %ebx
 226
 227         movl    %ebp, SAVE_EBP
 228         movl    PARAM_DIVISOR, %ebp
 229         orl     %ecx, %ecx              C size
 230
 231         movl    %edi, SAVE_EDI
 232         movl    PARAM_DST, %edi
 233         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 234
 235         jz      L(no_skip_div)          C if size==0
 236         movl    -4(%esi,%ecx,4), %eax   C src high limb
 237         xorl    %esi, %esi
 238
 239         cmpl    %ebp, %eax              C high cmp divisor
 240
 241         cmovc(  %eax, %edx)             C high is carry if high<divisor
 242         cmovnc( %eax, %esi)             C 0 if skip div, src high if not
 243
 244         movl    %esi, (%edi,%ecx,4)     C dst high limb
 245         sbbl    $0, %ecx                C size-1 if high<divisor
 246         movl    PARAM_SRC, %esi         C reload
 247 L(no_skip_div):
 248
 249
 250 L(start_1c):
 251         C eax
 252         C ebx   xsize
 253         C ecx   size
 254         C edx   carry
 255         C esi   src
 256         C edi   &dst[xsize-1]
 257         C ebp   divisor
 258
 259         leal    (%ebx,%ecx), %eax       C size+xsize
 260         cmpl    $MUL_THRESHOLD, %eax
 261         jae     L(mul_by_inverse)
 262
 263
 264 C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
 265 C It'd be possible to write them out without the looping, but no speedup
 266 C would be expected.
 267 C
 268 C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
 269 C integer part, but curiously not on the fractional part, where %ebp is a
 270 C (fixed) couple of cycles faster.
 271
 272         orl     %ecx, %ecx
 273         jz      L(divide_no_integer)
 274
 275 L(divide_integer):
 276         C eax   scratch (quotient)
 277         C ebx   xsize
 278         C ecx   counter
 279         C edx   scratch (remainder)
 280         C esi   src
 281         C edi   &dst[xsize-1]
 282         C ebp   divisor
 283
 284         movl    -4(%esi,%ecx,4), %eax
 285
 286         divl    PARAM_DIVISOR
 287
 288         movl    %eax, (%edi,%ecx,4)
 289         decl    %ecx
 290         jnz     L(divide_integer)
 291
 292
 293 L(divide_no_integer):
 294         movl    PARAM_DST, %edi
 295         orl     %ebx, %ebx
 296         jnz     L(divide_fraction)
 297
 298 L(divide_done):
 299         movl    SAVE_ESI, %esi
 300         movl    SAVE_EDI, %edi
 301         movl    %edx, %eax
 302
 303         movl    SAVE_EBX, %ebx
 304         movl    SAVE_EBP, %ebp
 305         addl    $STACK_SPACE, %esp
 306
 307         ret
 308
 309
 310 L(divide_fraction):
 311         C eax   scratch (quotient)
 312         C ebx   counter
 313         C ecx
 314         C edx   scratch (remainder)
 315         C esi
 316         C edi   dst
 317         C ebp   divisor
 318
 319         movl    $0, %eax
 320
 321         divl    %ebp
 322
 323         movl    %eax, -4(%edi,%ebx,4)
 324         decl    %ebx
 325         jnz     L(divide_fraction)
 326
 327         jmp     L(divide_done)
 328
 329
 330
 331 C -----------------------------------------------------------------------------
 332
 333 L(mul_by_inverse):
 334         C eax
 335         C ebx   xsize
 336         C ecx   size
 337         C edx   carry
 338         C esi   src
 339         C edi   &dst[xsize-1]
 340         C ebp   divisor
 341
 342         bsrl    %ebp, %eax              C 31-l
 343
 344         leal    12(%edi), %ebx          C &dst[xsize+2], loop dst stop
 345         leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]
 346
 347         movl    %edi, VAR_DST
 348         movl    %ebx, VAR_DST_STOP
 349
 350         movl    %ecx, %ebx              C size
 351         movl    $31, %ecx
 352
 353         movl    %edx, %edi              C carry
 354         movl    $-1, %edx
 355
 356         C
 357
 358         xorl    %eax, %ecx              C l
 359         incl    %eax                    C 32-l
 360
 361         shll    %cl, %ebp               C d normalized
 362         movl    %ecx, VAR_NORM
 363
 364         movd    %eax, %mm7
 365
 366         movl    $-1, %eax
 367         subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1
 368
 369         divl    %ebp                    C floor (b*(b-d)-1) / d
 370
 371 L(start_preinv):
 372         C eax   inverse
 373         C ebx   size
 374         C ecx   shift
 375         C edx
 376         C esi   src
 377         C edi   carry
 378         C ebp   divisor
 379         C
 380         C mm7   rshift
 381
 382         orl     %ebx, %ebx              C size
 383         movl    %eax, VAR_INVERSE
 384         leal    -12(%esi,%ebx,4), %eax  C &src[size-3]
 385
 386         jz      L(start_zero)
 387         movl    %eax, VAR_SRC
 388         cmpl    $1, %ebx
 389
 390         movl    8(%eax), %esi           C src high limb
 391         jz      L(start_one)
 392
 393 L(start_two_or_more):
 394         movl    4(%eax), %edx           C src second highest limb
 395
 396         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 397
 398         shldl(  %cl, %edx, %esi)        C n10 = high,second << l
 399
 400         cmpl    $2, %ebx
 401         je      L(integer_two_left)
 402         jmp     L(integer_top)
 403
 404
 405 L(start_one):
 406         shldl(  %cl, %esi, %edi)        C n2 = carry,high << l
 407
 408         shll    %cl, %esi               C n10 = high << l
 409         movl    %eax, VAR_SRC
 410         jmp     L(integer_one_left)
 411
 412
 413 L(start_zero):
 414         C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
 415         C skipped a division.
 416
 417         shll    %cl, %edi               C n2 = carry << l
 418         movl    %edi, %eax              C return value for zero_done
 419         cmpl    $0, PARAM_XSIZE
 420
 421         je      L(zero_done)
 422         jmp     L(fraction_some)
 423
 424
 425
 426 C -----------------------------------------------------------------------------
 427 C
 428 C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
 429 C execution.  The instruction scheduling is important, with various
 430 C apparently equivalent forms running 1 to 5 cycles slower.
 431 C
 432 C A lower bound for the time would seem to be 16 cycles, based on the
 433 C following successive dependencies.
 434 C
 435 C                     cycles
 436 C               n2+n1   1
 437 C               mul     6
 438 C               q1+1    1
 439 C               mul     6
 440 C               sub     1
 441 C               addback 1
 442 C                      ---
 443 C                      16
 444 C
 445 C This chain is what the loop has already, but 16 cycles isn't achieved.
 446 C K7 has enough decode, and probably enough execute (depending maybe on what
 447 C a mul actually consumes), but nothing running under 17 has been found.
 448 C
 449 C In theory n2+n1 could be done in the sub and addback stages (by
 450 C calculating both n2 and n2+n1 there), but lack of registers makes this an
 451 C unlikely proposition.
 452 C
 453 C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
 454 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
 455 C chain, and nothing better than 18 cycles has been found when using it.
 456 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
 457 C be an extremely rare event.
 458 C
 459 C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
 460 C if some special data is coming out with this always, the q1_ff special
 461 C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
 462 C induce the q1_ff case, for speed measurements or testing.  Note that
 463 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
 464 C
 465 C The instruction groupings and empty comments show the cycles for a naive
 466 C in-order view of the code (conveniently ignoring the load latency on
 467 C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
 468 C to the extent that out-of-order execution rearranges it.  In this case
 469 C there's 19 cycles shown, but it executes at 17.
 470
 471         ALIGN(16)
 472 L(integer_top):
 473         C eax   scratch
 474         C ebx   scratch (nadj, q1)
 475         C ecx   scratch (src, dst)
 476         C edx   scratch
 477         C esi   n10
 478         C edi   n2
 479         C ebp   divisor
 480         C
 481         C mm0   scratch (src qword)
 482         C mm7   rshift for normalization
 483
 484         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 485         movl    %edi, %eax         C n2
 486         movl    VAR_SRC, %ecx
 487
 488         leal    (%ebp,%esi), %ebx
 489         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 490         sbbl    $-1, %eax          C n2+n1
 491
 492         mull    VAR_INVERSE        C m*(n2+n1)
 493
 494         movq    (%ecx), %mm0       C next limb and the one below it
 495         subl    $4, %ecx
 496
 497         movl    %ecx, VAR_SRC
 498
 499         C
 500
 501         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 502         leal    1(%edi), %ebx      C n2+1
 503         movl    %ebp, %eax         C d
 504
 505         C
 506
 507         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 508         jz      L(q1_ff)
 509         movl    VAR_DST, %ecx
 510
 511         mull    %ebx               C (q1+1)*d
 512
 513         psrlq   %mm7, %mm0
 514
 515         leal    -4(%ecx), %ecx
 516
 517         C
 518
 519         subl    %eax, %esi
 520         movl    VAR_DST_STOP, %eax
 521
 522         C
 523
 524         sbbl    %edx, %edi         C n - (q1+1)*d
 525         movl    %esi, %edi         C remainder -> n2
 526         leal    (%ebp,%esi), %edx
 527
 528         movd    %mm0, %esi
 529
 530         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 531         sbbl    $0, %ebx           C q
 532         cmpl    %eax, %ecx
 533
 534         movl    %ebx, (%ecx)
 535         movl    %ecx, VAR_DST
 536         jne     L(integer_top)
 537
 538
 539 L(integer_loop_done):
 540
 541
 542 C -----------------------------------------------------------------------------
 543 C
 544 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
 545 C q1_ff special case.  This make the code a bit smaller and simpler, and
 546 C costs only 1 cycle (each).
 547
 548 L(integer_two_left):
 549         C eax   scratch
 550         C ebx   scratch (nadj, q1)
 551         C ecx   scratch (src, dst)
 552         C edx   scratch
 553         C esi   n10
 554         C edi   n2
 555         C ebp   divisor
 556         C
 557         C mm7   rshift
 558
 559         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 560         movl    %edi, %eax         C n2
 561         movl    PARAM_SRC, %ecx
 562
 563         leal    (%ebp,%esi), %ebx
 564         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 565         sbbl    $-1, %eax          C n2+n1
 566
 567         mull    VAR_INVERSE        C m*(n2+n1)
 568
 569         movd    (%ecx), %mm0       C src low limb
 570
 571         movl    VAR_DST_STOP, %ecx
 572
 573         C
 574
 575         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 576         leal    1(%edi), %ebx      C n2+1
 577         movl    %ebp, %eax         C d
 578
 579         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 580
 581         sbbl    $0, %ebx
 582
 583         mull    %ebx               C (q1+1)*d
 584
 585         psllq   $32, %mm0
 586
 587         psrlq   %mm7, %mm0
 588
 589         C
 590
 591         subl    %eax, %esi
 592
 593         C
 594
 595         sbbl    %edx, %edi         C n - (q1+1)*d
 596         movl    %esi, %edi         C remainder -> n2
 597         leal    (%ebp,%esi), %edx
 598
 599         movd    %mm0, %esi
 600
 601         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 602         sbbl    $0, %ebx           C q
 603
 604         movl    %ebx, -4(%ecx)
 605
 606
 607 C -----------------------------------------------------------------------------
 608 L(integer_one_left):
 609         C eax   scratch
 610         C ebx   scratch (nadj, q1)
 611         C ecx   dst
 612         C edx   scratch
 613         C esi   n10
 614         C edi   n2
 615         C ebp   divisor
 616         C
 617         C mm7   rshift
 618
 619         movl    VAR_DST_STOP, %ecx
 620         cmpl    $0x80000000, %esi  C n1 as 0=c, 1=nc
 621         movl    %edi, %eax         C n2
 622
 623         leal    (%ebp,%esi), %ebx
 624         cmovc(  %esi, %ebx)        C nadj = n10 + (-n1 & d), ignoring overflow
 625         sbbl    $-1, %eax          C n2+n1
 626
 627         mull    VAR_INVERSE        C m*(n2+n1)
 628
 629         C
 630
 631         C
 632
 633         C
 634
 635         addl    %ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
 636         leal    1(%edi), %ebx      C n2+1
 637         movl    %ebp, %eax         C d
 638
 639         C
 640
 641         adcl    %edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
 642
 643         sbbl    $0, %ebx           C q1 if q1+1 overflowed
 644
 645         mull    %ebx
 646
 647         C
 648
 649         C
 650
 651         C
 652
 653         subl    %eax, %esi
 654
 655         C
 656
 657         sbbl    %edx, %edi         C n - (q1+1)*d
 658         movl    %esi, %edi         C remainder -> n2
 659         leal    (%ebp,%esi), %edx
 660
 661         cmovc(  %edx, %edi)        C n - q1*d if underflow from using q1+1
 662         sbbl    $0, %ebx           C q
 663
 664         movl    %ebx, -8(%ecx)
 665         subl    $8, %ecx
 666
 667
 668
 669 L(integer_none):
 670         cmpl    $0, PARAM_XSIZE
 671         jne     L(fraction_some)
 672
 673         movl    %edi, %eax
 674 L(fraction_done):
 675         movl    VAR_NORM, %ecx
 676 L(zero_done):
 677         movl    SAVE_EBP, %ebp
 678
 679         movl    SAVE_EDI, %edi
 680         movl    SAVE_ESI, %esi
 681
 682         movl    SAVE_EBX, %ebx
 683         addl    $STACK_SPACE, %esp
 684
 685         shrl    %cl, %eax
 686         emms
 687
 688         ret
 689
 690
 691 C -----------------------------------------------------------------------------
 692 C
 693 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
 694 C of q*d is simply -d and the remainder n-q*d = n10+d
 695
 696 L(q1_ff):
 697         C eax   (divisor)
 698         C ebx   (q1+1 == 0)
 699         C ecx
 700         C edx
 701         C esi   n10
 702         C edi   n2
 703         C ebp   divisor
 704
 705         movl    VAR_DST, %ecx
 706         movl    VAR_DST_STOP, %edx
 707         subl    $4, %ecx
 708
 709         psrlq   %mm7, %mm0
 710         leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2
 711         movl    %ecx, VAR_DST
 712
 713         movd    %mm0, %esi              C next n10
 714
 715         movl    $-1, (%ecx)
 716         cmpl    %ecx, %edx
 717         jne     L(integer_top)
 718
 719         jmp     L(integer_loop_done)
 720
 721
 722
 723 C -----------------------------------------------------------------------------
 724 C
 725 C Being the fractional part, the "source" limbs are all zero, meaning
 726 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
 727 C
 728 C The loop runs at 15 cycles.  The dependent chain is the same as the
 729 C general case above, but without the n2+n1 stage (due to n1==0), so 15
 730 C would seem to be the lower bound.
 731 C
 732 C A not entirely obvious simplification is that q1+1 never overflows a limb,
 733 C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
 734 C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
 735 C rnd() means rounding down to a multiple of d.
 736 C
 737 C       m*n2 + b*n2 <= m*(d-1) + b*(d-1)
 738 C                    = m*d + b*d - m - b
 739 C                    = floor((b(b-d)-1)/d)*d + b*d - m - b
 740 C                    = rnd(b(b-d)-1) + b*d - m - b
 741 C                    = rnd(b(b-d)-1 + b*d) - m - b
 742 C                    = rnd(b*b-1) - m - b
 743 C                    <= (b-2)*b
 744 C
 745 C Unchanged from the general case is that the final quotient limb q can be
 746 C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
 747 C equation 8.4 of the paper which simplifies as follows when n1==0 and
 748 C n0==0.
 749 C
 750 C       n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
 751 C
 752 C As before, the instruction groupings and empty comments show a naive
 753 C in-order view of the code, which is made a nonsense by out of order
 754 C execution.  There's 17 cycles shown, but it executes at 15.
 755 C
 756 C Rotating the store q and remainder->n2 instructions up to the top of the
 757 C loop gets the run time down from 16 to 15.
 758
 759         ALIGN(16)
 760 L(fraction_some):
 761         C eax
 762         C ebx
 763         C ecx
 764         C edx
 765         C esi
 766         C edi   carry
 767         C ebp   divisor
 768
 769         movl    PARAM_DST, %esi
 770         movl    VAR_DST_STOP, %ecx      C &dst[xsize+2]
 771         movl    %edi, %eax
 772
 773         subl    $8, %ecx                C &dst[xsize]
 774         jmp     L(fraction_entry)
 775
 776
 777         ALIGN(16)
 778 L(fraction_top):
 779         C eax   n2 carry, then scratch
 780         C ebx   scratch (nadj, q1)
 781         C ecx   dst, decrementing
 782         C edx   scratch
 783         C esi   dst stop point
 784         C edi   (will be n2)
 785         C ebp   divisor
 786
 787         movl    %ebx, (%ecx)    C previous q
 788         movl    %eax, %edi      C remainder->n2
 789
 790 L(fraction_entry):
 791         mull    VAR_INVERSE     C m*n2
 792
 793         movl    %ebp, %eax      C d
 794         subl    $4, %ecx        C dst
 795         leal    1(%edi), %ebx
 796
 797         C
 798
 799         C
 800
 801         C
 802
 803         C
 804
 805         addl    %edx, %ebx      C 1 + high(n2<<32 + m*n2) = q1+1
 806
 807         mull    %ebx            C (q1+1)*d
 808
 809         C
 810
 811         C
 812
 813         C
 814
 815         negl    %eax            C low of n - (q1+1)*d
 816
 817         C
 818
 819         sbbl    %edx, %edi      C high of n - (q1+1)*d, caring only about carry
 820         leal    (%ebp,%eax), %edx
 821
 822         cmovc(  %edx, %eax)     C n - q1*d if underflow from using q1+1
 823         sbbl    $0, %ebx        C q
 824         cmpl    %esi, %ecx
 825
 826         jne     L(fraction_top)
 827
 828
 829         movl    %ebx, (%ecx)
 830         jmp     L(fraction_done)
 831
 832 EPILOGUE()