beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium4 / sse2 / divrem_1.asm
blob 0146fab1177e0dbac80e254efeb769b1a6649749
1 dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 1999-2004 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
37 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
38 C mp_srcptr src, mp_size_t size,
39 C mp_limb_t divisor);
40 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
41 C mp_srcptr src, mp_size_t size,
42 C mp_limb_t divisor, mp_limb_t carry);
43 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
44 C mp_srcptr src, mp_size_t size,
45 C mp_limb_t divisor, mp_limb_t inverse,
46 C unsigned shift);
48 C Algorithm:
50 C The method and nomenclature follow part 8 of "Division by Invariant
51 C Integers using Multiplication" by Granlund and Montgomery, reference in
52 C gmp.texi.
54 C "m" is written for what is m' in the paper, and "d" for d_norm, which
55 C won't cause any confusion since it's only the normalized divisor that's of
56 C any use in the code. "b" is written for 2^N, the size of a limb, N being
57 C 32 here.
59 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
60 C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
61 C us have just a psubq on the dependent chain.
63 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
64 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
66 C Notes:
68 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
69 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
70 C carry, since in normal circumstances that will be a very rare event.
72 C The test for skipping a division is branch free (once size>=1 is tested).
73 C The store to the destination high limb is 0 when a divide is skipped, or
74 C if it's not skipped then a copy of the src high limb is stored. The
75 C latter is in case src==dst.
77 C There's a small bias towards expecting xsize==0, by having code for
78 C xsize==0 in a straight line and xsize!=0 under forward jumps.
80 C Enhancements:
82 C The loop measures 32 cycles, but the dependent chain would suggest it
83 C could be done with 30. Not sure where to start looking for the extras.
85 C Alternatives:
87 C If the divisor is normalized (high bit set) then a division step can
88 C always be skipped, since the high destination limb is always 0 or 1 in
89 C that case. It doesn't seem worth checking for this though, since it
90 C probably occurs infrequently.
93 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
94 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
95 dnl
96 dnl The inverse takes about 80-90 cycles to calculate, but after that the
97 dnl multiply is 32 c/l versus division at about 58 c/l.
98 dnl
99 dnl At 4 limbs the div is a touch faster than the mul (and of course
100 dnl simpler), so start the mul from 5 limbs.
102 deflit(MUL_THRESHOLD, 5)
dnl  Stack-frame offsets of the incoming parameters (FRAME==0 at entry).
dnl  PARAM_CARRY and PARAM_PREINV_INVERSE share offset 24: each is the 6th
dnl  argument of its respective entry point (mpn_divrem_1c takes carry,
dnl  mpn_preinv_divrem_1 takes inverse -- see the prototypes above).
105 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
106 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
107 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
108 defframe(PARAM_DIVISOR,20)
109 defframe(PARAM_SIZE, 16)
110 defframe(PARAM_SRC, 12)
111 defframe(PARAM_XSIZE, 8)
112 defframe(PARAM_DST, 4)
114 dnl re-use parameter space
dnl  Callee-saved registers are stashed in parameter slots that have already
dnl  been read (or are reloaded from their SAVE_ alias), so no stack frame of
dnl  our own is needed.
115 define(SAVE_ESI,`PARAM_SIZE')
116 define(SAVE_EBP,`PARAM_SRC')
117 define(SAVE_EDI,`PARAM_DIVISOR')
118 define(SAVE_EBX,`PARAM_DST')
120 TEXT
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1 -- the caller supplies the pre-computed inverse (m)
C and normalization shift (l), so this entry point only loads parameters,
C decides whether the high source limb lets one division step be skipped,
C normalizes d, and jumps into the shared loop at L(start_preinv).
122 ALIGN(16)
123 PROLOGUE(mpn_preinv_divrem_1)
124 deflit(`FRAME',0)
126 movl PARAM_SIZE, %ecx
127 xorl %edx, %edx C carry if can't skip a div
129 movl %esi, SAVE_ESI
130 movl PARAM_SRC, %esi
132 movl %ebp, SAVE_EBP
133 movl PARAM_DIVISOR, %ebp
135 movl %edi, SAVE_EDI
136 movl PARAM_DST, %edi
138 movl -4(%esi,%ecx,4), %eax C src high limb
140 movl %ebx, SAVE_EBX
141 movl PARAM_XSIZE, %ebx
143 movd PARAM_PREINV_INVERSE, %mm4
145 movd PARAM_PREINV_SHIFT, %mm7 C l
146 cmpl %ebp, %eax C high cmp divisor
148 cmovc( %eax, %edx) C high is carry if high<divisor
149 movd %edx, %mm0 C carry
151 movd %edx, %mm1 C carry
152 movl $0, %edx
154 movd %ebp, %mm5 C d
155 cmovnc( %eax, %edx) C 0 if skip div, src high if not
156 C (the latter in case src==dst)
157 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
C The carry flag set by the cmpl above is still live here: the intervening
C movd/movl/cmov/leal instructions leave the flags untouched, so the sbbl
C drops the loop count by one exactly when a division step is skipped.
159 movl %edx, (%edi,%ecx,4) C dst high limb
160 sbbl $0, %ecx C skip one division if high<divisor
161 movl $32, %eax
163 subl PARAM_PREINV_SHIFT, %eax C eax = 32-l
164 psllq %mm7, %mm5 C d normalized
165 leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
166 leal -4(%esi,%ecx,4), %esi C &src[size-1]
168 movd %eax, %mm6 C 32-l
169 jmp L(start_preinv)
171 EPILOGUE()
C -----------------------------------------------------------------------------
C mpn_divrem_1c -- same as mpn_divrem_1 but with an initial carry limb
C supplied by the caller.  No check is made for skipping a division, since
C per the notes above a zero carry is expected to be a rare event.  Control
C joins the common code at L(start_1c).
174 ALIGN(16)
175 PROLOGUE(mpn_divrem_1c)
176 deflit(`FRAME',0)
178 movl PARAM_CARRY, %edx
180 movl PARAM_SIZE, %ecx
182 movl %esi, SAVE_ESI
183 movl PARAM_SRC, %esi
185 movl %ebp, SAVE_EBP
186 movl PARAM_DIVISOR, %ebp
188 movl %edi, SAVE_EDI
189 movl PARAM_DST, %edi
191 movl %ebx, SAVE_EBX
192 movl PARAM_XSIZE, %ebx
194 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
195 jmp L(start_1c)
197 EPILOGUE()
C -----------------------------------------------------------------------------
C mpn_divrem_1 -- when xsize+size is below MUL_THRESHOLD the plain "divl"
C loops are used; otherwise the divisor inverse is computed and the
C multiply-by-inverse loops (shared with the other entry points via
C L(start_1c)/L(start_preinv)) are used.  One division is skipped when the
C src high limb is less than the divisor.
C
C NOTE(review): this copy of the file had lost the two "ret" instructions
C (after "movl %edx, %eax" in L(divide_done), and after "emms" in
C L(fraction_done)) -- without them control would fall through into the
C following code.  They are restored below.
200 ALIGN(16)
201 PROLOGUE(mpn_divrem_1)
202 deflit(`FRAME',0)
204 movl PARAM_SIZE, %ecx
205 xorl %edx, %edx C initial carry (if can't skip a div)
207 movl %esi, SAVE_ESI
208 movl PARAM_SRC, %esi
210 movl %ebp, SAVE_EBP
211 movl PARAM_DIVISOR, %ebp
213 movl %edi, SAVE_EDI
214 movl PARAM_DST, %edi
216 movl %ebx, SAVE_EBX
217 movl PARAM_XSIZE, %ebx
218 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
220 orl %ecx, %ecx C size
221 jz L(no_skip_div) C if size==0
222 movl -4(%esi,%ecx,4), %eax C src high limb
224 cmpl %ebp, %eax C high cmp divisor
226 cmovnc( %eax, %edx) C 0 if skip div, src high if not
227 movl %edx, (%edi,%ecx,4) C dst high limb
229 movl $0, %edx
230 cmovc( %eax, %edx) C high is carry if high<divisor
C Carry from the cmpl above survives the mov/cmov instructions in between.
232 sbbl $0, %ecx C size-1 if high<divisor
233 L(no_skip_div):
236 L(start_1c):
237 C eax
238 C ebx xsize
239 C ecx size
240 C edx carry
241 C esi src
242 C edi &dst[xsize-1]
243 C ebp divisor
245 leal (%ebx,%ecx), %eax C size+xsize
246 leal -4(%esi,%ecx,4), %esi C &src[size-1]
247 leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
249 cmpl $MUL_THRESHOLD, %eax
250 jae L(mul_by_inverse)
253 orl %ecx, %ecx
254 jz L(divide_no_integer) C if size==0
C Plain division of the integer part: each divl divides the two-limb value
C edx:src[i] by the divisor; the remainder stays in edx for the next limb.
256 L(divide_integer):
257 C eax scratch (quotient)
258 C ebx xsize
259 C ecx counter
260 C edx carry
261 C esi src, decrementing
262 C edi dst, decrementing
263 C ebp divisor
265 movl (%esi), %eax
266 subl $4, %esi
268 divl %ebp
270 movl %eax, (%edi)
271 subl $4, %edi
273 subl $1, %ecx
274 jnz L(divide_integer)
277 L(divide_no_integer):
278 orl %ebx, %ebx
279 jnz L(divide_fraction) C if xsize!=0
281 L(divide_done):
282 movl SAVE_ESI, %esi
283 movl SAVE_EDI, %edi
284 movl SAVE_EBX, %ebx
285 movl SAVE_EBP, %ebp
286 movl %edx, %eax
ret C return remainder in eax (restored -- lost from this copy)
C Fraction limbs divide edx:0 by the divisor (low limb is zero).
290 L(divide_fraction):
291 C eax scratch (quotient)
292 C ebx counter
293 C ecx
294 C edx carry
295 C esi
296 C edi dst, decrementing
297 C ebp divisor
299 movl $0, %eax
301 divl %ebp
303 movl %eax, (%edi)
304 subl $4, %edi
306 subl $1, %ebx
307 jnz L(divide_fraction)
309 jmp L(divide_done)
313 C -----------------------------------------------------------------------------
C Compute the inverse m = floor((b*(b-d)-1)/d) of the normalized divisor,
C per the Granlund/Montgomery method described at the top of the file.
315 L(mul_by_inverse):
316 C eax
317 C ebx xsize
318 C ecx size
319 C edx carry
320 C esi &src[size-1]
321 C edi &dst[size+xsize-1]
322 C ebp divisor
324 bsrl %ebp, %eax C 31-l
325 movd %edx, %mm0 C carry
326 movd %edx, %mm1 C carry
327 movl %ecx, %edx C size
328 movl $31, %ecx
332 xorl %eax, %ecx C l = leading zeros on d
333 addl $1, %eax
335 shll %cl, %ebp C d normalized
336 movd %ecx, %mm7 C l
337 movl %edx, %ecx C size
339 movd %eax, %mm6 C 32-l
340 movl $-1, %edx
341 movl $-1, %eax
345 subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
347 divl %ebp C floor (b*(b-d)-1 / d)
348 movd %ebp, %mm5 C d
352 movd %eax, %mm4 C m
355 L(start_preinv):
356 C eax inverse
357 C ebx xsize
358 C ecx size
359 C edx
360 C esi &src[size-1]
361 C edi &dst[size+xsize-1]
362 C ebp
364 C mm0 carry
365 C mm1 carry
366 C mm2
367 C mm4 m
368 C mm5 d
369 C mm6 32-l
370 C mm7 l
372 psllq %mm7, %mm0 C n2 = carry << l, for size==0
374 subl $1, %ecx
375 jb L(integer_none)
377 movd (%esi), %mm0 C src high limb
378 punpckldq %mm1, %mm0
379 psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
380 jz L(integer_last)
383 C The dependent chain here consists of
385 C 2 paddd n1+n2
386 C 8 pmuludq m*(n1+n2)
387 C 2 paddq n2:nadj + m*(n1+n2)
388 C 2 psrlq q1
389 C 8 pmuludq d*q1
390 C 2 psubq (n-d)-q1*d
391 C 2 psrlq high n-(q1+1)*d mask
392 C 2 pand d masked
393 C 2 paddd n2+d addback
394 C --
395 C 30
397 C But it seems to run at 32 cycles, so presumably there's something else
398 C going on.
400 ALIGN(16)
401 L(integer_top):
402 C eax
403 C ebx
404 C ecx counter, size-1 to 0
405 C edx
406 C esi src, decrementing
407 C edi dst, decrementing
409 C mm0 n2
410 C mm4 m
411 C mm5 d
412 C mm6 32-l
413 C mm7 l
415 ASSERT(b,`C n2<d
416 movd %mm0, %eax
417 movd %mm5, %edx
418 cmpl %edx, %eax')
420 movd -4(%esi), %mm1 C next src limbs
421 movd (%esi), %mm2
422 leal -4(%esi), %esi
424 punpckldq %mm2, %mm1
425 psrlq %mm6, %mm1 C n10
427 movq %mm1, %mm2 C n10
428 movq %mm1, %mm3 C n10
429 psrad $31, %mm1 C -n1
430 pand %mm5, %mm1 C -n1 & d
431 paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
433 psrld $31, %mm2 C n1
434 paddd %mm0, %mm2 C n2+n1
435 punpckldq %mm0, %mm1 C n2:nadj
437 pmuludq %mm4, %mm2 C m*(n2+n1)
441 paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
442 pxor %mm2, %mm2 C break dependency, saves 4 cycles
443 pcmpeqd %mm2, %mm2 C FF...FF
444 psrlq $63, %mm2 C 1
446 psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
448 paddd %mm1, %mm2 C q1+1
449 pmuludq %mm5, %mm1 C q1*d
451 punpckldq %mm0, %mm3 C n = n2:n10
452 pxor %mm0, %mm0
454 psubq %mm5, %mm3 C n - d
458 psubq %mm1, %mm3 C n - (q1+1)*d
460 por %mm3, %mm0 C copy remainder -> new n2
461 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
463 ASSERT(be,`C 0 or -1
464 movd %mm3, %eax
465 addl $1, %eax
466 cmpl $1, %eax')
468 paddd %mm3, %mm2 C q
469 pand %mm5, %mm3 C mask & d
471 paddd %mm3, %mm0 C addback if necessary
472 movd %mm2, (%edi)
473 leal -4(%edi), %edi
475 subl $1, %ecx
476 ja L(integer_top)
C Last integer limb: same quotient step as L(integer_top), but the low
C source limb is shifted with psllq instead of combining two limbs.
479 L(integer_last):
480 C eax
481 C ebx xsize
482 C ecx
483 C edx
484 C esi &src[0]
485 C edi &dst[xsize]
487 C mm0 n2
488 C mm4 m
489 C mm5 d
490 C mm6
491 C mm7 l
493 ASSERT(b,`C n2<d
494 movd %mm0, %eax
495 movd %mm5, %edx
496 cmpl %edx, %eax')
498 movd (%esi), %mm1 C src[0]
499 psllq %mm7, %mm1 C n10
501 movq %mm1, %mm2 C n10
502 movq %mm1, %mm3 C n10
503 psrad $31, %mm1 C -n1
504 pand %mm5, %mm1 C -n1 & d
505 paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
507 psrld $31, %mm2 C n1
508 paddd %mm0, %mm2 C n2+n1
509 punpckldq %mm0, %mm1 C n2:nadj
511 pmuludq %mm4, %mm2 C m*(n2+n1)
515 paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
516 pcmpeqd %mm2, %mm2 C FF...FF
517 psrlq $63, %mm2 C 1
519 psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
520 paddd %mm1, %mm2 C q1+1
522 pmuludq %mm5, %mm1 C q1*d
523 punpckldq %mm0, %mm3 C n
524 psubq %mm5, %mm3 C n - d
525 pxor %mm0, %mm0
529 psubq %mm1, %mm3 C n - (q1+1)*d
531 por %mm3, %mm0 C remainder -> n2
532 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
534 ASSERT(be,`C 0 or -1
535 movd %mm3, %eax
536 addl $1, %eax
537 cmpl $1, %eax')
539 paddd %mm3, %mm2 C q
540 pand %mm5, %mm3 C mask & d
542 paddd %mm3, %mm0 C addback if necessary
543 movd %mm2, (%edi)
544 leal -4(%edi), %edi
547 L(integer_none):
548 C eax
549 C ebx xsize
551 orl %ebx, %ebx
552 jnz L(fraction_some) C if xsize!=0
555 L(fraction_done):
556 movl SAVE_EBP, %ebp
557 psrld %mm7, %mm0 C remainder
559 movl SAVE_EDI, %edi
560 movd %mm0, %eax
562 movl SAVE_ESI, %esi
563 movl SAVE_EBX, %ebx
564 emms
ret C return remainder in eax (restored -- lost from this copy)
569 C -----------------------------------------------------------------------------
572 L(fraction_some):
573 C eax
574 C ebx xsize
575 C ecx
576 C edx
577 C esi
578 C edi &dst[xsize-1]
579 C ebp
C Fraction loop: the source limb is implicitly zero, so n = n2:0 and the
C quotient step simplifies (no nadj needed).
582 L(fraction_top):
583 C eax
584 C ebx counter, xsize iterations
585 C ecx
586 C edx
587 C esi src, decrementing
588 C edi dst, decrementing
590 C mm0 n2
591 C mm4 m
592 C mm5 d
593 C mm6 32-l
594 C mm7 l
596 ASSERT(b,`C n2<d
597 movd %mm0, %eax
598 movd %mm5, %edx
599 cmpl %edx, %eax')
601 movq %mm0, %mm1 C n2
602 pmuludq %mm4, %mm0 C m*n2
604 pcmpeqd %mm2, %mm2
605 psrlq $63, %mm2
609 psrlq $32, %mm0 C high(m*n2)
611 paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
613 paddd %mm0, %mm2 C q1+1
614 pmuludq %mm5, %mm0 C q1*d
616 psllq $32, %mm1 C n = n2:0
617 psubq %mm5, %mm1 C n - d
621 psubq %mm0, %mm1 C r = n - (q1+1)*d
622 pxor %mm0, %mm0
624 por %mm1, %mm0 C r -> n2
625 psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
627 ASSERT(be,`C 0 or -1
628 movd %mm1, %eax
629 addl $1, %eax
630 cmpl $1, %eax')
632 paddd %mm1, %mm2 C q
633 pand %mm5, %mm1 C mask & d
635 paddd %mm1, %mm0 C addback if necessary
636 movd %mm2, (%edi)
637 leal -4(%edi), %edi
639 subl $1, %ebx
640 jne L(fraction_top)
643 jmp L(fraction_done)
645 EPILOGUE()