dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.


C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                         mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
C                          mp_srcptr src, mp_size_t size,
C                          mp_limb_t divisor, mp_limb_t carry);
C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
C                                mp_srcptr src, mp_size_t size,
C                                mp_limb_t divisor, mp_limb_t inverse,
C                                unsigned shift);
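C
C A rough usage sketch, not part of this file: mpn_divrem_1 divides
C {src,size} by divisor, stores the size integer quotient limbs at
C dst[xsize..xsize+size-1] plus xsize further fraction quotient limbs (as if
C src were extended by xsize zero limbs) at dst[0..xsize-1], and returns the
C remainder.  For example, with 32-bit limbs:
C
C   mp_limb_t src[2] = { 5, 7 };     /* the value 7*2^32 + 5 */
C   mp_limb_t quot[2];
C   mp_limb_t rem = mpn_divrem_1 (quot, 0, src, 2, 3);
C   /* quot now holds the 2-limb quotient, rem the remainder mod 3 */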

C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
C see that file for some comments.  It's possible what's here can be improved.


dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
dnl
dnl  The different speeds of the integer and fraction parts mean that using
dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
dnl  for the integer part and a bit lower for the fraction part.  (Or what's
dnl  really wanted is to speed up the integer part!)
dnl
dnl  The threshold is set to make the integer part right.  At 4 limbs the
dnl  div and mul are about the same there, but on the fractional part the
dnl  mul is much faster.

deflit(MUL_THRESHOLD, 4)
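
dnl  Roughly, the selection made at L(start_1c) below amounts to the
dnl  following (a sketch only, not the actual dispatch code):
dnl
dnl      if (size + xsize < MUL_THRESHOLD)
dnl          /* plain divl loops: L(divide_integer), L(divide_fraction) */
dnl      else
dnl          /* multiply by inverse: L(mul_by_inverse) */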


defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
defframe(PARAM_CARRY,   24)         dnl mpn_divrem_1c
defframe(PARAM_DIVISOR, 20)
defframe(PARAM_SIZE,    16)
defframe(PARAM_SRC,     12)
defframe(PARAM_XSIZE,    8)
defframe(PARAM_DST,      4)

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)

defframe(VAR_NORM,    -20)
defframe(VAR_INVERSE, -24)
defframe(VAR_SRC,     -28)
defframe(VAR_DST,     -32)
defframe(VAR_DST_STOP,-36)

deflit(STACK_SPACE, 36)
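
dnl  Rough frame picture (offsets are relative to %esp at function entry,
dnl  which is how defframe is used here; FRAME tracks the later adjustment):
dnl
dnl       4..28    parameters, PARAM_DST lowest, return address at 0
dnl      -4..-16   saved ebx/esi/edi/ebp
dnl     -20..-36   VAR_NORM .. VAR_DST_STOP, all within STACK_SPACE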

        TEXT
        ALIGN(16)

PROLOGUE(mpn_preinv_divrem_1)
deflit(`FRAME',0)
        movl    PARAM_XSIZE, %ecx
        subl    $STACK_SPACE, %esp      FRAME_subl_esp(STACK_SPACE)

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        movl    %ebx, SAVE_EBX
        movl    PARAM_SIZE, %ebx

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edx

        movl    -4(%esi,%ebx,4), %eax   C src high limb
        xorl    %edi, %edi              C initial carry (if can't skip a div)

        leal    8(%edx,%ecx,4), %edx    C &dst[xsize+2]
        xor     %ecx, %ecx

        movl    %edx, VAR_DST_STOP      C &dst[xsize+2]
        cmpl    %ebp, %eax              C high cmp divisor

        cmovc(  %eax, %edi)             C high is carry if high<divisor

        cmovnc( %eax, %ecx)             C 0 if skip div, src high if not
                                        C (the latter in case src==dst)

        movl    %ecx, -12(%edx,%ebx,4)  C dst high limb

        sbbl    $0, %ebx                C skip one division if high<divisor
        movl    PARAM_PREINV_SHIFT, %ecx

        leal    -8(%edx,%ebx,4), %edx   C &dst[xsize+size]
        movl    $32, %eax

        movl    %edx, VAR_DST           C &dst[xsize+size]

        shll    %cl, %ebp               C d normalized
        subl    %ecx, %eax
        movl    %ecx, VAR_NORM

        movd    %eax, %mm7              C rshift
        movl    PARAM_PREINV_INVERSE, %eax
        jmp     L(start_preinv)

EPILOGUE()


        ALIGN(16)

PROLOGUE(mpn_divrem_1c)
deflit(`FRAME',0)
        movl    PARAM_CARRY, %edx

        movl    PARAM_SIZE, %ecx
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi

        leal    -4(%edi,%ebx,4), %edi
        jmp     L(start_1c)

EPILOGUE()


C offset 0x31, close enough to aligned
PROLOGUE(mpn_divrem_1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx
        movl    $0, %edx                C initial carry (if can't skip a div)
        subl    $STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

        movl    %ebp, SAVE_EBP
        movl    PARAM_DIVISOR, %ebp

        movl    %ebx, SAVE_EBX
        movl    PARAM_XSIZE, %ebx

        movl    %esi, SAVE_ESI
        movl    PARAM_SRC, %esi
        orl     %ecx, %ecx              C size

        movl    %edi, SAVE_EDI
        movl    PARAM_DST, %edi

        leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
        jz      L(no_skip_div)          C if size==0

        movl    -4(%esi,%ecx,4), %eax   C src high limb
        xorl    %esi, %esi
        cmpl    %ebp, %eax              C high cmp divisor

        cmovc(  %eax, %edx)             C high is carry if high<divisor

        cmovnc( %eax, %esi)             C 0 if skip div, src high if not
                                        C (the latter in case src==dst)

        movl    %esi, (%edi,%ecx,4)     C dst high limb

        sbbl    $0, %ecx                C size-1 if high<divisor
        movl    PARAM_SRC, %esi         C reload
L(no_skip_div):


L(start_1c):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    (%ebx,%ecx), %eax       C size+xsize
        cmpl    $MUL_THRESHOLD, %eax
        jae     L(mul_by_inverse)

        orl     %ecx, %ecx
        jz      L(divide_no_integer)

L(divide_integer):
        C eax   scratch (quotient)
        C ebx   xsize
        C ecx   counter
        C edx   scratch (remainder)
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        movl    -4(%esi,%ecx,4), %eax

        divl    %ebp

        movl    %eax, (%edi,%ecx,4)
        decl    %ecx
        jnz     L(divide_integer)


L(divide_no_integer):
        movl    PARAM_DST, %edi
        orl     %ebx, %ebx
        jnz     L(divide_fraction)

L(divide_done):
        movl    SAVE_ESI, %esi

        movl    SAVE_EDI, %edi

        movl    SAVE_EBX, %ebx
        movl    %edx, %eax

        movl    SAVE_EBP, %ebp
        addl    $STACK_SPACE, %esp

        ret

L(divide_fraction):
        C eax   scratch (quotient)
        C ebx   counter
        C ecx
        C edx   scratch (remainder)
        C esi
        C edi   dst
        C ebp   divisor

        movl    $0, %eax

        divl    %ebp

        movl    %eax, -4(%edi,%ebx,4)
        decl    %ebx
        jnz     L(divide_fraction)


        jmp     L(divide_done)



C -----------------------------------------------------------------------------

L(mul_by_inverse):
        C eax
        C ebx   xsize
        C ecx   size
        C edx   carry
        C esi   src
        C edi   &dst[xsize-1]
        C ebp   divisor

        leal    12(%edi), %ebx          C &dst[xsize+2], loop dst stop

        movl    %ebx, VAR_DST_STOP
        leal    4(%edi,%ecx,4), %edi    C &dst[xsize+size]

        movl    %edi, VAR_DST
        movl    %ecx, %ebx              C size

        bsrl    %ebp, %ecx              C 31-l
        movl    %edx, %edi              C carry

        leal    1(%ecx), %eax           C 32-l
        xorl    $31, %ecx               C l

        movl    %ecx, VAR_NORM
        movl    $-1, %edx

        shll    %cl, %ebp               C d normalized
        movd    %eax, %mm7

        movl    $-1, %eax
        subl    %ebp, %edx              C (b-d)-1 giving edx:eax = b*(b-d)-1

        divl    %ebp                    C floor (b*(b-d)-1) / d
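
C The inverse just computed is, in C terms (a sketch only, 32-bit limbs,
C d here being the divisor already shifted so its high bit is set):
C
C   /* floor ((b*(b-d) - 1) / d)  with  b = 2^32 */
C   uint32_t inverse = (uint32_t) ((~(uint64_t) 0 - ((uint64_t) d << 32)) / d);
C
C since b*(b-d) - 1 = 2^64 - 2^32*d - 1 = ~0 - ((uint64_t) d << 32).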

L(start_preinv):
        C eax   inverse
        C ebx   size
        C ecx   shift
        C edx
        C esi   src
        C edi   carry
        C ebp   divisor

        C mm7   rshift

        movl    %eax, VAR_INVERSE
        orl     %ebx, %ebx              C size
        leal    -12(%esi,%ebx,4), %eax  C &src[size-3]

        movl    %eax, VAR_SRC
        jz      L(start_zero)

        movl    8(%eax), %esi           C src high limb
        cmpl    $1, %ebx
        jz      L(start_one)

L(start_two_or_more):
        movl    4(%eax), %edx           C src second highest limb

        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shldl(  %cl, %edx, %esi)        C n10 = high,second << l

        cmpl    $2, %ebx
        je      L(integer_two_left)
        jmp     L(integer_top)


L(start_one):
        shldl(  %cl, %esi, %edi)        C n2 = carry,high << l

        shll    %cl, %esi               C n10 = high << l
        jmp     L(integer_one_left)


L(start_zero):
        C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
        C skipped a division.

        shll    %cl, %edi               C n2 = carry << l
        movl    %edi, %eax              C return value for zero_done
        cmpl    $0, PARAM_XSIZE

        je      L(zero_done)
        jmp     L(fraction_some)



C -----------------------------------------------------------------------------

C This loop runs at about 25 cycles, which is probably sub-optimal, and
C certainly more than the dependent chain would suggest.  A better loop, or
C a better rough analysis of what's possible, would be welcomed.

C In the current implementation, the following successively dependent
C micro-ops seem to exist.

C                            uops
C       n2+n1      1   (addl)
C       mul        5
C       q1+1       3   (addl/adcl)
C       mul        5
C       sub        3   (subl/sbbl)
C       addback    2   (cmov)
C                 ---
C                  19

C Lack of registers hinders explicit scheduling and it might be that the
C normal out of order execution isn't able to hide enough under the mul
C latencies.

C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
C combination was tried for the addback (despite the fact it would lengthen
C the dependent chain) but found to be no faster.
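
C For reference, one pass of the loop below is roughly the following C code
C (a sketch only, assuming 32-bit limbs and <stdint.h> types; n2 is the
C running remainder, n10 the next shifted source limb, d the normalized
C divisor, m the value in VAR_INVERSE):
C
C   uint32_t n1   = n10 >> 31;                  /* top bit of n10 */
C   uint32_t nadj = n10 + (n1 ? d : 0);         /* mod 2^32, as noted above */
C   uint64_t p    = (uint64_t) m * (n2 + n1) + nadj;
C   uint32_t q1   = n2 + (uint32_t) (p >> 32);  /* quotient estimate */
C   uint64_t n    = ((uint64_t) n2 << 32) | n10;
C   uint64_t q1p1 = (uint64_t) q1 + 1;          /* 2^32 here is the q1_ff case */
C   uint32_t q, r;
C   if (n < q1p1 * d)                           /* q1+1 one too big, add back */
C     { q = q1;              r = (uint32_t) (n - (uint64_t) q1 * d); }
C   else
C     { q = (uint32_t) q1p1; r = (uint32_t) (n - q1p1 * d); }
C   /* q is stored, r becomes the next n2 */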

        ALIGN(16)
L(integer_top):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   d

        C mm0   scratch (src qword)
        C mm7   rshift for normalization

        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    VAR_SRC, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1
        movq    (%ecx), %mm0            C next src limb and the one below it

        mull    VAR_INVERSE             C m*(n2+n1)

        subl    $4, %ecx

        movl    %ecx, VAR_SRC

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        movl    %ebp, %eax              C d
        leal    1(%edi), %ebx           C n2+1

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
        jz      L(q1_ff)

        mull    %ebx                    C (q1+1)*d

        movl    VAR_DST, %ecx
        psrlq   %mm7, %mm0

        subl    %eax, %esi
        movl    VAR_DST_STOP, %eax

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx                C q
        subl    $4, %ecx

        movl    %ebx, (%ecx)
        cmpl    %eax, %ecx

        movl    %ecx, VAR_DST
        jne     L(integer_top)


L(integer_loop_done):


C -----------------------------------------------------------------------------

C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
C q1_ff special case.  This makes the code a bit smaller and simpler, and
C costs only 2 cycles (each).
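C
C (When q1+1 does wrap to zero, the carry makes that sbbl restore 0xFFFFFFFF,
C which in that situation is already the true quotient, so the subsequent
C subtract finds no borrow and the addback is not taken.)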

L(integer_two_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (src, dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor

        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    PARAM_SRC, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1

        mull    VAR_INVERSE             C m*(n2+n1)

        movd    (%ecx), %mm0            C src low limb

        movl    VAR_DST_STOP, %ecx

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx           C n2+1
        movl    %ebp, %eax              C d

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx

        mull    %ebx                    C (q1+1)*d

        psllq   $32, %mm0

        psrlq   %mm7, %mm0

        subl    %eax, %esi

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1
        movd    %mm0, %esi

        sbbl    $0, %ebx                C q

        movl    %ebx, -4(%ecx)


C -----------------------------------------------------------------------------
L(integer_one_left):
        C eax   scratch
        C ebx   scratch (nadj, q1)
        C ecx   scratch (dst)
        C edx   scratch
        C esi   n10
        C edi   n2
        C ebp   divisor

        C mm7   rshift


        movl    %esi, %eax
        movl    %ebp, %ebx

        sarl    $31, %eax               C -n1
        movl    VAR_DST_STOP, %ecx

        andl    %eax, %ebx              C -n1 & d
        negl    %eax                    C n1

        addl    %esi, %ebx              C nadj = n10 + (-n1 & d), ignoring overflow
        addl    %edi, %eax              C n2+n1

        mull    VAR_INVERSE             C m*(n2+n1)

        addl    %ebx, %eax              C m*(n2+n1) + nadj, low giving carry flag
        leal    1(%edi), %ebx           C n2+1
        movl    %ebp, %eax              C d

        adcl    %edx, %ebx              C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1

        sbbl    $0, %ebx                C q1 if q1+1 overflowed

        mull    %ebx

        subl    %eax, %esi
        movl    PARAM_XSIZE, %eax

        sbbl    %edx, %edi              C n - (q1+1)*d
        movl    %esi, %edi              C remainder -> n2
        leal    (%ebp,%esi), %edx

        cmovc(  %edx, %edi)             C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx                C q

        movl    %ebx, -8(%ecx)
        subl    $8, %ecx

        orl     %eax, %eax              C xsize
        jnz     L(fraction_some)

        movl    %edi, %eax
L(fraction_done):
        movl    VAR_NORM, %ecx
L(zero_done):
        movl    SAVE_EBP, %ebp

        movl    SAVE_EDI, %edi

        movl    SAVE_ESI, %esi

        movl    SAVE_EBX, %ebx
        addl    $STACK_SPACE, %esp

        shrl    %cl, %eax
        emms
        ret


C -----------------------------------------------------------------------------

C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
C of q*d is simply -d and the remainder n-q*d = n10+d
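C
C (With b = 2^32:  q*d = (b-1)*d = b*d - d, so modulo b the low dword of q*d
C is -d, and n10 - (-d) = n10 + d mod b.  The true remainder is less than d,
C so it fits in that single low dword and no high-part subtraction is needed.)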

L(q1_ff):
        C eax   (divisor)
        C ebx   (q1+1 == 0)
        C ecx
        C edx
        C esi   n10
        C edi   n2
        C ebp   divisor

        movl    VAR_DST, %ecx
        movl    VAR_DST_STOP, %edx
        subl    $4, %ecx

        movl    %ecx, VAR_DST
        psrlq   %mm7, %mm0
        leal    (%ebp,%esi), %edi       C n-q*d remainder -> next n2

        movl    $-1, (%ecx)
        movd    %mm0, %esi              C next n10

        cmpl    %ecx, %edx
        jne     L(integer_top)

        jmp     L(integer_loop_done)



C -----------------------------------------------------------------------------

C In the current implementation, the following successively dependent
C micro-ops seem to exist.

C                            uops
C       mul        5
C       q1+1       1   (addl)
C       mul        5
C       sub        3   (negl/sbbl)
C       addback    2   (cmov)
C                 ---
C                  16

C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
C the addback was found to be a touch slower.
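
C For the fraction part each new numerator limb is zero, so in the sketch
C given before L(integer_top) above n10 = 0, hence n1 = 0 and nadj = 0, and
C one pass of the loop below reduces to roughly (same assumptions):
C
C   uint32_t q1   = n2 + (uint32_t) (((uint64_t) m * n2) >> 32);
C   uint64_t n    = (uint64_t) n2 << 32;
C   uint64_t q1p1 = (uint64_t) q1 + 1;
C   uint32_t q, r;
C   if (n < q1p1 * d)
C     { q = q1;              r = (uint32_t) (n - (uint64_t) q1 * d); }
C   else
C     { q = (uint32_t) q1p1; r = (uint32_t) (n - q1p1 * d); }
C
C which is the shorter dependent chain counted above.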

        ALIGN(16)
L(fraction_some):
        C eax
        C ebx
        C ecx
        C edx
        C esi
        C edi   carry
        C ebp   divisor

        movl    PARAM_DST, %esi
        movl    VAR_DST_STOP, %ecx      C &dst[xsize+2]
        movl    %edi, %eax

        subl    $8, %ecx                C &dst[xsize]


        ALIGN(16)
L(fraction_top):
        C eax   n2, then scratch
        C ebx   scratch (nadj, q1)
        C ecx   dst, decrementing
        C edx   scratch
        C esi   dst stop point
        C edi   n2
        C ebp   divisor

        mull    VAR_INVERSE             C m*n2

        movl    %ebp, %eax              C d
        subl    $4, %ecx                C dst
        leal    1(%edi), %ebx

        addl    %edx, %ebx              C 1 + high(n2<<32 + m*n2) = q1+1

        mull    %ebx                    C (q1+1)*d

        negl    %eax                    C low of n - (q1+1)*d

        sbbl    %edx, %edi              C high of n - (q1+1)*d, caring only about carry
        leal    (%ebp,%eax), %edx

        cmovc(  %edx, %eax)             C n - q1*d if underflow from using q1+1

        sbbl    $0, %ebx                C q
        movl    %eax, %edi              C remainder->n2
        cmpl    %esi, %ecx

        movl    %ebx, (%ecx)            C previous q
        jne     L(fraction_top)


        jmp     L(fraction_done)

EPILOGUE()