beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / dive_1.asm
blob458bd02539554d0d1596b751382ba9c49e8a9548
1 dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C cycles/limb
35 C Athlon: 11.0
36 C Hammer: 9.0
39 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor);
42 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
43 C achieved with no special effort. The load and shrld latencies are hidden
44 C by out of order execution.
46 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack frame layout for mpn_divexact_1.
C
C Positive offsets are the four stack arguments.  Negative offsets are the
C six 4-byte local slots reserved by subtracting STACK_SPACE from %esp.
C NOTE(review): offsets are presumably relative to %esp at function entry,
C with the FRAME literal tracking later %esp adjustments -- confirm against
C the defframe definition in the x86 m4 support files.
C
C PARAM_DIVISOR is reused as scratch: once the trailing zero bits have been
C stripped, the slot is overwritten with the odd part of the divisor so it
C can serve as the memory operand of mull.

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,    8)
defframe(PARAM_DST,    4)

defframe(SAVE_EBX,     -4)
defframe(SAVE_ESI,     -8)
defframe(SAVE_EDI,    -12)
defframe(SAVE_EBP,    -16)
defframe(VAR_INVERSE, -20)
defframe(VAR_DST_END, -24)

deflit(STACK_SPACE, 24)

	TEXT

C mpn_divexact_1 (dst, src, size, divisor)
C
C Outline:
C   1. Strip the trailing zero bits from divisor, leaving an odd d and a
C      shift count in %ecx.
C   2. Compute inv with d*inv == 1 mod 2^32: an 8-bit seed from
C      binvert_limb_table refined by two steps of inv = 2*inv - inv*inv*d.
C   3. For each limb, low to high: take the source limb shifted right by
C      %cl (shrdl pulls in bits from the next limb), subtract the carry
C      bit and the carry limb (high half of the previous q*d), and multiply
C      by inv to obtain the quotient limb.
C
C Preconditions as the code is written:
C   divisor != 0  -- L(strip_twos) only exits when a 1 bit is shifted out.
C   size >= 1     -- src[0] is loaded before size is tested.
C
C %ebx, %esi, %edi and %ebp are saved in the frame and restored on both
C exit paths.  %eax, %ecx and %edx are clobbered.

	ALIGN(16)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there's usually only one or two trailing zero bits then this
	C should be faster than bsfl.
	C
	C On exit %ecx is the number of trailing zero bits and %eax holds the
	C odd part of the divisor shifted right by one (its low 1 bit is the
	C bit that was shifted into the carry flag).
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two refinement steps, interleaved with the pointer setup.

	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)			C size==1, no neighbouring limb

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)

	movl	%edi, VAR_DST_END	C %edi doubles as a temporary in the loop
	xorl	%ebx, %ebx		C no carry into the lowest limb
	jmp	L(entry)


	ALIGN(8)
L(top):
	C eax	q
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl
	movl	VAR_DST_END, %edi

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx

L(entry):
	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


	C Most significant limb: there is no higher limb to shift bits in
	C from, so a plain shrl is used, and no carry out is needed.

	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret				C must not fall through into L(one)


L(one):
	C size==1: q = (src[0] >> shift) * inv
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp
	movl	%eax, -4(%edi)

	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()
ASM_END()