dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the GMP build-time m4 configuration (defframe, deflit,
dnl  PROLOGUE/EPILOGUE, cmov macros, etc.).  The extraction had broken this
dnl  directive across three lines with a stray line number fused in front,
dnl  which would make m4 emit garbage into the assembler output.
include(`../config.m4')
C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's possible what's here can be
C improved.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts means that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.
dnl  Switch from divl to multiply-by-inverse at xsize+size >= 4 limbs.
deflit(MUL_THRESHOLD, 4)


dnl  Incoming stack parameters (byte offsets from the entry %esp, set up by
dnl  defframe from config.m4).  PARAM_CARRY overlaps PARAM_PREINV_INVERSE at
dnl  offset 24 because mpn_divrem_1c and mpn_preinv_divrem_1 take different
dnl  trailing arguments in the same slot.
defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,  8)
defframe(PARAM_DST,    4)

dnl  Locals below the entry %esp: saved callee-preserved registers first,
dnl  then scratch variables for the multiply-by-inverse path.
defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

dnl  Total bytes subtracted from %esp to cover all SAVE_*/VAR_* slots.
deflit(STACK_SPACE, 36)
dnl  NOTE(review): this extraction is lossy -- the number fused at the start
dnl  of each line is the original source line number; those numbers jump
dnl  (94, 95, 101, 104, ...) and single instructions are split across lines,
dnl  so statements in the gaps are missing (e.g. the loads of PARAM_SRC/
dnl  PARAM_DST into %esi/%edx and the register saves).  Do not assemble
dnl  as-is; recover the pristine file before editing.
92 PROLOGUE(mpn_preinv_divrem_1)
C Entry for the preinverted variant: the caller passes a precomputed
C inverse (PARAM_PREINV_INVERSE) and normalization shift
C (PARAM_PREINV_SHIFT), so no divl is needed to set them up here.
94 movl PARAM_XSIZE
, %ecx
95 subl $STACK_SPACE
, %esp FRAME_subl_esp
(STACK_SPACE
)
C Frame now covers the SAVE_*/VAR_* slots declared above.
101 movl PARAM_SIZE
, %ebx
104 movl PARAM_DIVISOR
, %ebp
109 movl
-4(%esi,%ebx,4), %eax C src
high limb
110 xorl
%edi, %edi C initial carry
(if can
't skip a div)
114 leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
117 movl %edx, VAR_DST_STOP C &dst[xsize+2]
C When the src high limb is below the divisor one division is skipped:
C the high limb becomes the initial carry (cmovc) and the effective
C size is reduced by the sbbl below.
118 cmpl %ebp, %eax C high cmp divisor
120 cmovc( %eax, %edi) C high is carry if high<divisor
122 cmovnc( %eax, %ecx) C 0 if skip div, src high if not
123 C (the latter in case src==dst)
125 movl %ecx, -12(%edx,%ebx,4) C dst high limb
127 sbbl $0, %ebx C skip one division if high<divisor
128 movl PARAM_PREINV_SHIFT, %ecx
130 leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
133 movl %edx, VAR_DST C &dst[xsize+size]
C Normalize the divisor by the caller-supplied shift; %mm7 keeps the
C right-shift count used later when unnormalizing.
135 shll %cl, %ebp C d normalized
139 movd %eax, %mm7 C rshift
140 movl PARAM_PREINV_INVERSE, %eax
dnl  NOTE(review): lossy extraction here too -- the fused original line
dnl  numbers jump (149, 151, 153, ...), so instructions in the gaps are
dnl  missing from this view.
149 PROLOGUE(mpn_divrem_1c)
C Same as mpn_divrem_1 but with an explicit initial carry argument;
C PARAM_CARRY is loaded into %edx as the starting remainder before
C falling into the common code.
151 movl PARAM_CARRY
, %edx
153 movl PARAM_SIZE
, %ecx
154 subl $STACK_SPACE
, %esp
155 deflit
(`FRAME
',STACK_SPACE)
158 movl PARAM_XSIZE, %ebx
164 movl PARAM_DIVISOR, %ebp
C NOTE(review): %edi is used below before any visible load of it --
C presumably a movl PARAM_DST, %edi sits in one of the missing lines;
C verify against the pristine source.
169 leal -4(%edi,%ebx,4), %edi
175 C offset 0x31, close enough to aligned
176 PROLOGUE(mpn_divrem_1)
179 movl PARAM_SIZE
, %ecx
180 movl
$0, %edx C initial carry
(if can
't skip a div)
181 subl $STACK_SPACE, %esp
182 deflit(`FRAME',STACK_SPACE
)
185 movl PARAM_DIVISOR
, %ebp
188 movl PARAM_XSIZE
, %ebx
192 orl
%ecx, %ecx C
size
197 leal
-4(%edi,%ebx,4), %edi C
&dst
[xsize
-1]
198 jz L
(no_skip_div
) C if
size==0
200 movl
-4(%esi,%ecx,4), %eax C src
high limb
202 cmpl
%ebp, %eax C
high cmp divisor
204 cmovc
( %eax, %edx) C
high is carry if
high<divisor
206 cmovnc
( %eax, %esi) C
0 if skip
div, src
high if
not
207 C
(the latter
in case src
==dst
)
209 movl
%esi, (%edi,%ecx,4) C dst
high limb
211 sbbl
$0, %ecx C
size-1 if
high<divisor
212 movl PARAM_SRC
, %esi C reload
225 leal
(%ebx,%ecx), %eax C
size+xsize
226 cmpl $MUL_THRESHOLD
, %eax
227 jae L
(mul_by_inverse
)
230 jz L
(divide_no_integer
)
233 C
eax scratch
(quotient
)
236 C
edx scratch
(remainder
)
241 movl
-4(%esi,%ecx,4), %eax
245 movl
%eax, (%edi,%ecx,4)
247 jnz L
(divide_integer
)
250 L
(divide_no_integer
):
253 jnz L
(divide_fraction
)
264 addl $STACK_SPACE
, %esp
270 C
eax scratch
(quotient
)
273 C
edx scratch
(remainder
)
282 movl
%eax, -4(%edi,%ebx,4)
284 jnz L
(divide_fraction
)
290 C
-----------------------------------------------------------------------------
301 leal
12(%edi), %ebx C
&dst
[xsize
+2], loop dst stop
303 movl
%ebx, VAR_DST_STOP
304 leal
4(%edi,%ecx,4), %edi C
&dst
[xsize
+size]
307 movl
%ecx, %ebx C
size
309 bsrl
%ebp, %ecx C
31-l
310 movl
%edx, %edi C carry
312 leal
1(%ecx), %eax C
32-l
318 shll
%cl, %ebp C d normalized
322 subl
%ebp, %edx C
(b
-d
)-1 giving
edx:eax = b
*(b
-d
)-1
324 divl
%ebp C floor
(b
*(b
-d
)-1) / d
337 movl
%eax, VAR_INVERSE
338 orl
%ebx, %ebx C
size
339 leal
-12(%esi,%ebx,4), %eax C
&src
[size-3]
344 movl
8(%eax), %esi C src
high limb
348 L
(start_two_or_more
):
349 movl
4(%eax), %edx C src second highest limb
351 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
353 shldl
( %cl, %edx, %esi) C n10
= high,second
<< l
356 je L
(integer_two_left
)
361 shldl
( %cl, %esi, %edi) C n2
= carry
,high << l
363 shll
%cl, %esi C n10
= high << l
364 jmp L
(integer_one_left
)
368 C Can be here with xsize
==0 if mpn_preinv_divrem_1 had
size==1 and
369 C skipped a division.
371 shll
%cl, %edi C n2
= carry
<< l
372 movl
%edi, %eax C return value for zero_done
380 C
-----------------------------------------------------------------------------
382 C
This loop runs at about
25 cycles
, which is probably
sub-optimal
, and
383 C certainly more than the dependent chain would suggest. A better
loop, or
384 C a better rough analysis of what
's possible, would be welcomed.
386 C In the current implementation, the following successively dependent
387 C micro-ops seem to exist.
399 C Lack of registers hinders explicit scheduling and it might be that the
400 C normal out of order execution isn't able to hide enough under the
mul
403 C Using sarl
/negl to pick
out n1 for the n2
+n1 stage is a touch faster than
404 C cmov
(and takes one uop off the dependent chain
). A sarl
/andl
/addl
405 C combination was tried for the addback
(despite the fact it would lengthen
406 C the dependent chain
) but found to be no faster.
412 C
ebx scratch
(nadj
, q1
)
413 C
ecx scratch
(src
, dst
)
419 C mm0 scratch
(src
qword)
420 C mm7 rshift for normalization
428 andl
%eax, %ebx C
-n1
& d
431 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
432 addl
%edi, %eax C n2
+n1
433 movq
(%ecx), %mm0 C next src limb
and the one below it
435 mull VAR_INVERSE C m
*(n2
+n1
)
445 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
447 leal
1(%edi), %ebx C n2
+1
449 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
464 movl VAR_DST_STOP
, %eax
466 sbbl
%edx, %edi C n
- (q1
+1)*d
467 movl
%esi, %edi C remainder
-> n2
468 leal
(%ebp,%esi), %edx
470 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
483 L
(integer_loop_done
):
486 C
-----------------------------------------------------------------------------
488 C Here
, and in integer_one_left below
, an sbbl
$0 is used rather than a
jz
489 C q1_ff special case.
This make the code a bit smaller
and simpler
, and
490 C costs only
2 cycles
(each
).
494 C
ebx scratch
(nadj
, q1
)
495 C
ecx scratch
(src
, dst
)
510 andl
%eax, %ebx C
-n1
& d
513 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
514 addl
%edi, %eax C n2
+n1
516 mull VAR_INVERSE C m
*(n2
+n1
)
518 movd
(%ecx), %mm0 C src
low limb
520 movl VAR_DST_STOP
, %ecx
526 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
527 leal
1(%edi), %ebx C n2
+1
530 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
546 sbbl
%edx, %edi C n
- (q1
+1)*d
547 movl
%esi, %edi C remainder
-> n2
548 leal
(%ebp,%esi), %edx
550 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
558 C
-----------------------------------------------------------------------------
561 C
ebx scratch
(nadj
, q1
)
575 movl VAR_DST_STOP
, %ecx
577 andl
%eax, %ebx C
-n1
& d
580 addl
%esi, %ebx C nadj
= n10
+ (-n1
& d
), ignoring overflow
581 addl
%edi, %eax C n2
+n1
583 mull VAR_INVERSE C m
*(n2
+n1
)
591 addl
%ebx, %eax C m
*(n2
+n1
) + nadj
, low giving carry flag
592 leal
1(%edi), %ebx C n2
+1
597 adcl
%edx, %ebx C
1 + high(n2
<<32 + m
*(n2
+n1
) + nadj
) = q1
+1
599 sbbl
$0, %ebx C q1 if q1
+1 overflowed
612 movl PARAM_XSIZE
, %eax
614 sbbl
%edx, %edi C n
- (q1
+1)*d
615 movl
%esi, %edi C remainder
-> n2
616 leal
(%ebp,%esi), %edx
618 cmovc
( %edx, %edi) C n
- q1
*d if underflow from using q1
+1
627 orl
%eax, %eax C xsize
641 addl $STACK_SPACE
, %esp
649 C
-----------------------------------------------------------------------------
651 C Special case for q1
=0xFFFFFFFF, giving q
=0xFFFFFFFF meaning the
low dword
652 C of q
*d is simply
-d
and the remainder n
-q
*d
= n10
+d
664 movl VAR_DST_STOP
, %edx
669 leal
(%ebp,%esi), %edi C n
-q
*d remainder
-> next n2
672 movd
%mm0
, %esi C next n10
677 jmp L
(integer_loop_done
)
681 C
-----------------------------------------------------------------------------
683 C
In the current implementation
, the following successively dependent
684 C micro
-ops seem to exist.
695 C The
loop in fact runs at about
17.5 cycles. Using a sarl
/andl
/addl for
696 C the addback was found to be a touch slower.
710 movl VAR_DST_STOP
, %ecx C
&dst
[xsize
+2]
713 subl
$8, %ecx C
&dst
[xsize
]
718 C
eax n2
, then scratch
719 C
ebx scratch
(nadj
, q1
)
720 C
ecx dst
, decrementing
726 mull VAR_INVERSE C m
*n2
738 addl
%edx, %ebx C
1 + high(n2
<<32 + m
*n2
) = q1
+1
750 negl
%eax C
low of n
- (q1
+1)*d
752 sbbl
%edx, %edi C
high of n
- (q1
+1)*d
, caring only about carry
753 leal
(%ebp,%eax), %edx
755 cmovc
( %edx, %eax) C n
- q1
*d if underflow from using q1
+1
758 movl
%eax, %edi C remainder
->n2
761 movl
%ebx, (%ecx) C previous q