dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
dnl  division.

dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C          K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
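
C A typical call, for illustration only (not from this file): divide a
C 4-limb number in np[] by 10, with no fraction limbs (xsize == 0):
C
C     #include <gmp.h>
C     mp_limb_t np[4] = { 1, 2, 3, 4 };
C     mp_limb_t qp[4];
C     mp_limb_t r = mpn_divrem_1 (qp, 0, np, 4, 10);
C     /* qp now holds the quotient limbs, and 0 <= r < 10 */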

C Algorithm:
C
C The method and nomenclature follow part 8 of "Division by Invariant
C Integers using Multiplication" by Granlund and Montgomery, reference in
C gmp.texi.
C
C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
C for m', and "d" for d_norm, which won't cause any confusion since it's
C only the normalized divisor that's of any use in the code.  "b" is written
C for 2^N, the size of a limb, N being 32 here.
C
C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
C q1==0xFFFFFFFF, then q1+1 would overflow, so that case branches to the
C special code at "q1_ff".  Since the true quotient is either q1 or q1+1,
C when q1==0xFFFFFFFF it must be the right value.
C
C For the last and second last steps, q1==0xFFFFFFFF is instead handled by
C an sbbl that drops back to 0xFFFFFFFF if adding 1 overflows.  This then
C goes through as normal, finding no addback required.  The sbbl costs an
C extra cycle over what the main loop code does, but it keeps code size
C and complexity down.
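
C As an illustration only (our C sketch, not GMP's code): a rough model of
C the inverse and of one division step, assuming 32-bit limbs, a normalized
C d (high bit set), and n2 < d.  The q1==0xFFFFFFFF overflow case is
C absorbed here by doing the q1*d subtraction in 64 bits.
C
C     #include <stdint.h>
C
C     /* m = floor((b*(b-d)-1)/d), with b = 2^32 */
C     static uint32_t invert_limb32 (uint32_t d)
C     {
C       uint64_t num = ((uint64_t) (~d) << 32) | 0xFFFFFFFFu;
C       return (uint32_t) (num / d);
C     }
C
C     /* divide n2:n10 by d, store the quotient limb, return remainder */
C     static uint32_t div_step32 (uint32_t n2, uint32_t n10,
C                                 uint32_t d, uint32_t m, uint32_t *q)
C     {
C       uint32_t n1   = n10 >> 31;
C       uint32_t nadj = n10 + (n1 ? d : 0);        /* ignoring overflow */
C       uint64_t p    = (uint64_t) m * (n2 + n1) + nadj;
C       uint32_t q1   = n2 + (uint32_t) (p >> 32);
C       uint64_t n    = ((uint64_t) n2 << 32) | n10;
C       uint64_t r1   = n - (uint64_t) q1 * d;     /* 0 <= r1 < 2*d */
C       if (r1 >= d)  { *q = q1 + 1; return (uint32_t) (r1 - d); }
C       else          { *q = q1;     return (uint32_t) r1;      }
C     }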

C Notes:
C
C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
C carry, since in normal circumstances that will be a very rare event.
C
C The test for skipping a division is branch free (once size>=1 is tested).
C The store to the destination high limb is 0 when a divide is skipped;
C when it's not skipped, a copy of the src high limb is stored there
C instead, which matters in case src==dst.
C
C There's a small bias towards expecting xsize==0, by having code for
C xsize==0 in a straight line and xsize!=0 under forward jumps.
C
C Alternatives:
C
C If the divisor is normalized (high bit set) then a division step can
C always be skipped, since the high destination limb is always 0 or 1 in
C that case.  It doesn't seem worth checking for this though, since it
C probably occurs infrequently, in particular note that big_base for a
C decimal mpn_get_str is not normalized in a 32-bit limb.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The inverse takes about 50 cycles to calculate, but after that the
dnl  multiply is 17 c/l versus division at 42 c/l.
dnl
dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
dnl  even more so on the fractional part.
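dnl
dnl  For instance (rough figures from the cycle counts above), at
dnl  size+xsize == 3 the inverse method costs about 50 + 3*17 = 101 cycles
dnl  against 3*42 = 126 cycles of plain divls, and the advantage grows with
dnl  the operand size.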

deflit(MUL_THRESHOLD, 3)


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,  24)  dnl mpn_divrem_1c
defframe(PARAM_DIVISOR,20)
defframe(PARAM_SIZE,   16)
defframe(PARAM_SRC,    12)
defframe(PARAM_XSIZE,   8)
defframe(PARAM_DST,     4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)

	TEXT
	ALIGN(32)

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
	movl	PARAM_XSIZE, %ecx
	movl	PARAM_DST, %edx
	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_SIZE, %ebx

	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
	movl	%edi, SAVE_EDI
	xorl	%edi, %edi		C carry

	movl	-4(%esi,%ebx,4), %eax	C src high limb
	xor	%ecx, %ecx

	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edi)		C high is carry if high<divisor
	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
					C (the latter in case src==dst)
	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
	sbbl	$0, %ebx		C skip one division if high<divisor
	movl	PARAM_PREINV_SHIFT, %ecx

	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
	movl	$32, %eax

	movl	%edx, VAR_DST		C &dst[xsize+size]

	shll	%cl, %ebp		C d normalized
	subl	%ecx, %eax
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7		C rshift
	movl	PARAM_PREINV_INVERSE, %eax
	jmp	L(start_preinv)

EPILOGUE()

	ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
	movl	PARAM_CARRY, %edx
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
	jmp	L(start_1c)

EPILOGUE()

	C offset 0xa1, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	$0, %edx		C initial carry (if can't skip a div)
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%ebx, SAVE_EBX
	movl	PARAM_XSIZE, %ebx

	movl	%ebp, SAVE_EBP
	movl	PARAM_DIVISOR, %ebp
	orl	%ecx, %ecx		C size

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi
	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]

	jz	L(no_skip_div)		C if size==0
	movl	-4(%esi,%ecx,4), %eax	C src high limb
	xorl	%esi, %esi

	cmpl	%ebp, %eax		C high cmp divisor

	cmovc(	%eax, %edx)		C high is carry if high<divisor
	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not

	movl	%esi, (%edi,%ecx,4)	C dst high limb
	sbbl	$0, %ecx		C size-1 if high<divisor
	movl	PARAM_SRC, %esi		C reload
L(no_skip_div):


L(start_1c):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	leal	(%ebx,%ecx), %eax	C size+xsize
	cmpl	$MUL_THRESHOLD, %eax
	jae	L(mul_by_inverse)


C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
C It'd be possible to write them out without the looping, but no speedup
C would be expected.
C
C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
C integer part, but curiously not on the fractional part, where %ebp is a
C (fixed) couple of cycles faster.
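C
C In C terms the two loops amount to roughly the following (illustration
C only, 32-bit limbs, "rem" being the carry limb):
C
C     for (i = size-1; i >= 0; i--) {      /* integer part */
C       uint64_t n = ((uint64_t) rem << 32) | src[i];
C       dst[xsize + i] = (uint32_t) (n / divisor);
C       rem            = (uint32_t) (n % divisor);
C     }
C     for (i = xsize-1; i >= 0; i--) {     /* fraction part, zero limbs */
C       uint64_t n = (uint64_t) rem << 32;
C       dst[i] = (uint32_t) (n / divisor);
C       rem    = (uint32_t) (n % divisor);
C     }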

	orl	%ecx, %ecx
	jz	L(divide_no_integer)

L(divide_integer):
	C eax	scratch (quotient)
	C ebx	xsize
	C ecx	counter
	C edx	scratch (remainder)
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	movl	-4(%esi,%ecx,4), %eax

	divl	PARAM_DIVISOR

	movl	%eax, (%edi,%ecx,4)
	decl	%ecx
	jnz	L(divide_integer)


L(divide_no_integer):
	movl	PARAM_DST, %edi
	orl	%ebx, %ebx
	jnz	L(divide_fraction)

L(divide_done):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi
	movl	%edx, %eax

	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp
	addl	$STACK_SPACE, %esp

	ret


L(divide_fraction):
	C eax	scratch (quotient)
	C ebx	counter
	C ecx
	C edx	scratch (remainder)
	C esi
	C edi	dst
	C ebp	divisor

	movl	$0, %eax

	divl	%ebp

	movl	%eax, -4(%edi,%ebx,4)
	decl	%ebx
	jnz	L(divide_fraction)

	jmp	L(divide_done)


C -----------------------------------------------------------------------------

L(mul_by_inverse):
	C eax
	C ebx	xsize
	C ecx	size
	C edx	carry
	C esi	src
	C edi	&dst[xsize-1]
	C ebp	divisor

	bsrl	%ebp, %eax		C 31-l

	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]

	movl	%edi, VAR_DST
	movl	%ebx, VAR_DST_STOP

	movl	%ecx, %ebx		C size
	movl	$31, %ecx

	movl	%edx, %edi		C carry
	movl	$-1, %edx

	xorl	%eax, %ecx		C l
	incl	%eax			C 32-l

	shll	%cl, %ebp		C d normalized
	movl	%ecx, VAR_NORM

	movd	%eax, %mm7

	movl	$-1, %eax
	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1

	divl	%ebp			C floor (b*(b-d)-1) / d

L(start_preinv):
	C eax	inverse
	C ebx	size
	C ecx	shift
	C edx
	C esi	src
	C edi	carry
	C ebp	divisor
	C
	C mm7	rshift

	orl	%ebx, %ebx		C size
	movl	%eax, VAR_INVERSE
	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]

	jz	L(start_zero)
	movl	%eax, VAR_SRC
	cmpl	$1, %ebx

	movl	8(%eax), %esi		C src high limb
	jz	L(start_one)

L(start_two_or_more):
	movl	4(%eax), %edx		C src second highest limb

	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shldl(	%cl, %edx, %esi)	C n10 = high,second << l

	cmpl	$2, %ebx
	je	L(integer_two_left)
	jmp	L(integer_top)


L(start_one):
	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l

	shll	%cl, %esi		C n10 = high << l
	movl	%eax, VAR_SRC
	jmp	L(integer_one_left)


L(start_zero):
	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
	C skipped a division.

	shll	%cl, %edi		C n2 = carry << l
	movl	%edi, %eax		C return value for zero_done
	cmpl	$0, PARAM_XSIZE

	je	L(zero_done)
	jmp	L(fraction_some)


C -----------------------------------------------------------------------------
C
C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
C execution.  The instruction scheduling is important, with various
C apparently equivalent forms running 1 to 5 cycles slower.
C
C A lower bound for the time would seem to be 16 cycles, based on the
C following successive dependencies.
C
C		      cycles
C	n2+n1		1
C	mul		6
C	q1+1		1
C	mul		6
C	sub		1
C	addback		1
C		       ---
C			16
C
C This chain is what the loop has already, but 16 cycles isn't achieved.
C K7 has enough decode, and probably enough execute (depending maybe on what
C a mul actually consumes), but nothing running under 17 has been found.
C
C In theory n2+n1 could be done in the sub and addback stages (by
C calculating both n2 and n2+n1 there), but lack of registers makes this an
C unlikely proposition.
C
C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
C chain, and nothing better than 18 cycles has been found when using it.
C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
C be an extremely rare event.
C
C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
C if some special data is coming out with this always, the q1_ff special
C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
C induce the q1_ff case, for speed measurements or testing.  Note that
C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
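C
C For instance, a hypothetical 4-limb operand of that 0x2FFF...FFFD form
C could be set up as follows (illustration only, not from this file):
C
C     mp_limb_t np[4] = { 0xFFFFFFFD, 0xFFFFFFFF, 0xFFFFFFFF, 0x2FFFFFFF };
C     mp_limb_t qp[4];
C     mp_limb_t r = mpn_divrem_1 (qp, 0, np, 4, 3);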

C The instruction groupings and empty comments show the cycles for a naive
C in-order view of the code (conveniently ignoring the load latency on
C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
C to the extent that out-of-order execution rearranges it.  In this case
C there's 19 cycles shown, but it executes at 17.

	ALIGN(16)
L(integer_top):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm0	scratch (src qword)
	C mm7	rshift for normalization

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2
	movl	VAR_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movq	(%ecx), %mm0		C next limb and the one below it
	subl	$4, %ecx

	movl	%ecx, VAR_SRC

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
	jz	L(q1_ff)
	movl	VAR_DST, %ecx

	mull	%ebx			C (q1+1)*d

	psrlq	%mm7, %mm0

	leal	-4(%ecx), %ecx

	C

	subl	%eax, %esi
	movl	VAR_DST_STOP, %eax

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q
	cmpl	%eax, %ecx

	movl	%ebx, (%ecx)
	movl	%ecx, VAR_DST
	jne	L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------
C
C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C to the q1_ff special case.  This makes the code a bit smaller and simpler,
C and costs only 1 cycle (each).

L(integer_two_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	scratch (src, dst)
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2
	movl	PARAM_SRC, %ecx

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	movd	(%ecx), %mm0		C src low limb

	movl	VAR_DST_STOP, %ecx

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx

	mull	%ebx			C (q1+1)*d

	psllq	$32, %mm0

	psrlq	%mm7, %mm0

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	movd	%mm0, %esi

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q

	movl	%ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
	C eax	scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst
	C edx	scratch
	C esi	n10
	C edi	n2
	C ebp	divisor
	C
	C mm7	rshift

	movl	VAR_DST_STOP, %ecx
	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
	movl	%edi, %eax		C n2

	leal	(%ebp,%esi), %ebx
	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
	sbbl	$-1, %eax		C n2+n1

	mull	VAR_INVERSE		C m*(n2+n1)

	C

	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
	leal	1(%edi), %ebx		C n2+1
	movl	%ebp, %eax		C d

	C

	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

	sbbl	$0, %ebx		C q1 if q1+1 overflowed

	mull	%ebx

	C

	subl	%eax, %esi

	C

	sbbl	%edx, %edi		C n - (q1+1)*d
	movl	%esi, %edi		C remainder -> n2
	leal	(%ebp,%esi), %edx

	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q

	movl	%ebx, -8(%ecx)
	subl	$8, %ecx


L(integer_none):
	cmpl	$0, PARAM_XSIZE
	jne	L(fraction_some)

	movl	%edi, %eax
L(fraction_done):
	movl	VAR_NORM, %ecx
L(zero_done):
	movl	SAVE_EBP, %ebp

	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	movl	SAVE_EBX, %ebx
	addl	$STACK_SPACE, %esp

	shrl	%cl, %eax
	emms
	ret


C -----------------------------------------------------------------------------
C
C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d

L(q1_ff):
	C eax	(divisor)
	C ebx	(q1+1 == 0)
	C ecx
	C edx
	C esi	n10
	C edi	n2
	C ebp	divisor

	movl	VAR_DST, %ecx
	movl	VAR_DST_STOP, %edx
	subl	$4, %ecx

	psrlq	%mm7, %mm0
	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
	movl	%ecx, VAR_DST

	movd	%mm0, %esi		C next n10

	movl	$-1, (%ecx)
	cmpl	%ecx, %edx
	jne	L(integer_top)

	jmp	L(integer_loop_done)


C -----------------------------------------------------------------------------
C
C Being the fractional part, the "source" limbs are all zero, meaning
C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
C
C The loop runs at 15 cycles.  The dependent chain is the same as the
C general case above, but without the n2+n1 stage (due to n1==0), so 15
C would seem to be the lower bound.
C
C A not entirely obvious simplification is that q1+1 never overflows a limb,
C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
C rnd() means rounding down to a multiple of d.
C
C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
C		     = m*d + b*d - m - b
C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
C		     = rnd(b(b-d)-1) + b*d - m - b
C		     = rnd(b(b-d)-1 + b*d) - m - b
C		     = rnd(b*b-1) - m - b
C		     <= (b-2)*b
C
C and since the high word of (b-2)*b is b-2, it follows that q1 <= b-2, so
C q1+1 <= b-1 never overflows.
C
C Unchanged from the general case is that the final quotient limb q can be
C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
C equation 8.4 of the paper which simplifies as follows when n1==0 and
C n0==0.
C
C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
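C
C In C, one fraction step reduces to the following (illustration only,
C 32-bit limbs, same conventions as the sketch in the Algorithm section):
C
C     static uint32_t frac_step32 (uint32_t n2, uint32_t d, uint32_t m,
C                                  uint32_t *q)
C     {
C       uint64_t p  = (uint64_t) m * n2;
C       uint32_t q1 = n2 + (uint32_t) (p >> 32);   /* q1 <= b-2 */
C       uint64_t r1 = ((uint64_t) n2 << 32) - (uint64_t) q1 * d;
C       if (r1 >= d)  { *q = q1 + 1; return (uint32_t) (r1 - d); }
C       else          { *q = q1;     return (uint32_t) r1;      }
C     }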
C
C As before, the instruction groupings and empty comments show a naive
C in-order view of the code, which is made nonsense by out-of-order
C execution.  There's 17 cycles shown, but it executes at 15.
C
C Rotating the store q and remainder->n2 instructions up to the top of the
C loop gets the run time down from 16 to 15.

	ALIGN(16)
L(fraction_some):
	C eax
	C ebx
	C ecx
	C edx
	C esi
	C edi	carry
	C ebp	divisor

	movl	PARAM_DST, %esi
	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
	movl	%edi, %eax

	subl	$8, %ecx		C &dst[xsize]
	jmp	L(fraction_entry)


	ALIGN(16)
L(fraction_top):
	C eax	n2 carry, then scratch
	C ebx	scratch (nadj, q1)
	C ecx	dst, decrementing
	C edx	scratch
	C esi	dst stop point
	C edi	(will be n2)
	C ebp	divisor

	movl	%ebx, (%ecx)		C previous q
	movl	%eax, %edi		C remainder->n2

L(fraction_entry):
	mull	VAR_INVERSE		C m*n2

	movl	%ebp, %eax		C d
	subl	$4, %ecx		C dst
	leal	1(%edi), %ebx

	C

	addl	%edx, %ebx		C 1 + high(n2<<32 + m*n2) = q1+1

	mull	%ebx			C (q1+1)*d

	C

	negl	%eax			C low of n - (q1+1)*d

	C

	sbbl	%edx, %edi		C high of n - (q1+1)*d, caring only about carry
	leal	(%ebp,%eax), %edx

	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1
	sbbl	$0, %ebx		C q
	cmpl	%esi, %ecx

	jne	L(fraction_top)


	movl	%ebx, (%ecx)
	jmp	L(fraction_done)

EPILOGUE()