beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / bdiv_q_1.asm
blob2af7bb9eb303b9b655ecdebe8599dfc1db806ae7
1 dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
3 dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
5 dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb
37 C Athlon: 11.0
38 C Hammer: 9.0
41 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
42 C mp_limb_t divisor);
44 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
45 C achieved with no special effort. The load and shrld latencies are hidden
46 C by out of order execution.
48 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Incoming stack arguments, as offsets from the stack pointer at function
C entry.  mpn_bdiv_q_1 is passed only the first four; mpn_pi1_bdiv_q_1 also
C reads PARAM_INVERSE and PARAM_SHIFT.
defframe(PARAM_DST,      4)
defframe(PARAM_SRC,      8)
defframe(PARAM_SIZE,    12)
defframe(PARAM_DIVISOR, 16)
defframe(PARAM_INVERSE, 20)
defframe(PARAM_SHIFT,   24)

C Local frame below the entry stack pointer: two working variables and the
C save slots for the four registers that both functions save and restore.
defframe(VAR_DST_END,  -24)
defframe(VAR_INVERSE,  -20)
defframe(SAVE_EBP,     -16)
defframe(SAVE_EDI,     -12)
defframe(SAVE_ESI,      -8)
defframe(SAVE_EBX,      -4)

C Bytes reserved for the local frame (six 4-byte slots).
deflit(STACK_SPACE, 24)

	TEXT
C mp_limb_t
C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
C		    mp_limb_t inverse, int shift)
C
C Writes size quotient limbs to dst.  Each source limb is combined with the
C next one and shifted right by shift bits, the running carry (one borrow
C bit plus the high limb of the previous q*divisor product) is subtracted,
C and the result is multiplied by inverse to give the quotient limb.
C
C All arguments are read from the stack through the PARAM_ frame macros.
C ebx, esi, edi and ebp are stored in the SAVE_ slots on entry and reloaded
C before each ret.  size must be at least 1, since src[0] is loaded
C unconditionally.
C NOTE(review): presumably divisor is odd and inverse*divisor == 1 mod 2^32;
C that contract belongs to the caller and is not checked here.
C
C L(common) is a second entry point, reached by a jmp from mpn_bdiv_q_1 with
C	eax = inverse		ecx = shift
C	esi = src end		edi = dst end		ebp = -size
C	PARAM_DIVISOR = the divisor to multiply by
C	frame allocated, ebx/esi/edi/ebp already stored in their SAVE_ slots

	ALIGN(16)
PROLOGUE(mpn_pi1_bdiv_q_1)
deflit(`FRAME',0)

	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	PARAM_SHIFT, %ecx	C shift count

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	movl	%ebx, SAVE_EBX

	C Point at the ends of both vectors and count up from -size to 0, so a
	C single register indexes both and the loop test is just incl/jnz.
	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	movl	PARAM_INVERSE, %eax	C inv

L(common):
	movl	%eax, VAR_INVERSE
	movl	(%esi,%ebp,4), %eax	C src[0]

	incl	%ebp
	jz	L(one)			C size==1, no neighbouring limb to shift in

	movl	(%esi,%ebp,4), %edx	C src[1]

	shrdl(	%cl, %edx, %eax)	C low limb of src >> shift

	movl	%edi, VAR_DST_END	C edi is reused as scratch inside the loop
	xorl	%ebx, %ebx		C no carry bit yet
	jmp	L(entry)

	ALIGN(8)
L(top):
	C eax	q, the quotient limb just stored
	C ebx	carry bit, 0 or 1
	C ecx	shift
	C edx	scratch
	C esi	src end
	C edi	dst end
	C ebp	counter, limbs, negative

	mull	PARAM_DIVISOR		C carry limb in edx (high half of q*d)

	movl	-4(%esi,%ebp,4), %eax
	movl	(%esi,%ebp,4), %edi	C next limb, supplies the bits shifted in

	shrdl(	%cl, %edi, %eax)

	subl	%ebx, %eax		C apply carry bit
	setc	%bl			C borrow out of that subtract
	movl	VAR_DST_END, %edi	C restore dst end after using edi as scratch

	subl	%edx, %eax		C apply carry limb
	adcl	$0, %ebx		C fold in the borrow from this subtract

L(entry):
	imull	VAR_INVERSE, %eax	C q = limb * inverse

	movl	%eax, -4(%edi,%ebp,4)
	incl	%ebp
	jnz	L(top)


	C Last limb: there is no higher limb to shift in, so a plain shrl is
	C used, and no carry needs to be produced for a following limb.
	mull	PARAM_DIVISOR		C carry limb in edx

	movl	-4(%esi), %eax		C src high limb
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi

	subl	%ebx, %eax		C apply carry bit
	movl	SAVE_EBX, %ebx
	movl	SAVE_EBP, %ebp

	subl	%edx, %eax		C apply carry limb

	imull	VAR_INVERSE, %eax

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	C Without this ret execution would continue into L(one) with the frame
	C already released.
	ret

L(one):
	C size==1: eax holds src[0], nothing to carry in or out.
	shrl	%cl, %eax
	movl	SAVE_ESI, %esi
	movl	SAVE_EBX, %ebx

	imull	VAR_INVERSE, %eax

	movl	SAVE_EBP, %ebp

	movl	%eax, -4(%edi)
	movl	SAVE_EDI, %edi
	addl	$STACK_SPACE, %esp

	ret
EPILOGUE()
C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                         mp_limb_t divisor);
C
C Front end for the division loop in mpn_pi1_bdiv_q_1.  It derives the two
C values the loop needs from divisor alone and then jumps to L(common):
C
C   shift    number of trailing zero bits in divisor (left in ecx)
C   inverse  multiplicative inverse, mod 2^32, of divisor with those zero
C            bits removed (left in eax)
C
C The stripped divisor is written back over PARAM_DIVISOR because the loop
C multiplies by that stack slot.  ebx, esi, edi and ebp are stored in the
C SAVE_ slots here and reloaded by the code at L(common) before it returns.
C
C divisor must be nonzero: the strip loop below only stops once a 1 bit has
C been shifted out.

	ALIGN(16)
PROLOGUE(mpn_bdiv_q_1)
deflit(`FRAME',0)

	movl	PARAM_DIVISOR, %eax
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
	movl	$-1, %ecx		C shift count, incremented before first test

	movl	%ebp, SAVE_EBP
	movl	PARAM_SIZE, %ebp

	movl	%esi, SAVE_ESI
	movl	%edi, SAVE_EDI

	C If there is usually only one or two trailing zero bits then this
	C should be faster than bsfl.
L(strip_twos):
	incl	%ecx
	shrl	%eax
	jnc	L(strip_twos)

	C The loop shifted out the lowest 1 bit as well, so eax is now (d-1)/2
	C for the odd part d; 2*eax+1 rebuilds d.
	movl	%ebx, SAVE_EBX
	leal	1(%eax,%eax), %ebx	C d without twos
	andl	$127, %eax		C d/2, 7 bits

	C Table lookup indexed by bits 1..7 of d gives an 8-bit inverse.  The PIC
	C build fetches the table address with LEA, the non-PIC build addresses
	C the table directly.
ifdef(`PIC',`
	LEA(	binvert_limb_table, %edx)
	movzbl	(%eax,%edx), %eax		C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')

	C Two Newton steps inv = 2*inv - inv*inv*d, each doubling the number of
	C correct low bits (8 -> 16 -> 32).  Pointer setup is interleaved between
	C the multiplies.
	leal	(%eax,%eax), %edx	C 2*inv
	movl	%ebx, PARAM_DIVISOR	C d without twos

	imull	%eax, %eax		C inv*inv

	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	imull	%ebx, %eax		C inv*inv*d

	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
	leal	(%edx,%edx), %eax	C 2*inv

	imull	%edx, %edx		C inv*inv

	leal	(%esi,%ebp,4), %esi	C src end
	leal	(%edi,%ebp,4), %edi	C dst end
	negl	%ebp			C -size

	imull	%ebx, %edx		C inv*inv*d

	subl	%edx, %eax		C inv = 2*inv - inv*inv*d

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	C Continue in mpn_pi1_bdiv_q_1 with eax = inverse, ecx = shift,
	C esi/edi = src/dst end, ebp = -size.
	jmp	L(common)
EPILOGUE()
ASM_END()