dnl  (gitweb extraction residue, kept as comments)
dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k6 / mmx / dive_1.asm
dnl  blob 1bbad3a3185ab8294f823fe4ca1f2d1289e955d3
dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C           divisor
C         odd   even
C K6:    10.0  12.0   cycles/limb
C K6-2:  10.0  11.5


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C A simple divl is used for size==1.  This is about 10 cycles faster for an
C odd divisor or 20 cycles for an even divisor.
C
C The loops are quite sensitive to code alignment, speeds should be
C rechecked (odd and even divisor, pic and non-pic) if contemplating
C changing anything.

C Stack frame offsets of the cdecl parameters (return address at 0).
defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,     8)
defframe(PARAM_DST,     4)

dnl  re-use parameter space: dst is copied to %edi early, so its slot
dnl  can hold the computed modular inverse for the duration of the loops
define(VAR_INVERSE,`PARAM_DST')
TEXT

	ALIGN(32)

C Exact division: strip twos from the divisor, build an 8->16->32 bit
C Newton-iterated modular inverse of the odd part (seeded from
C binvert_limb_table), then produce each quotient limb as
C q = (src_limb - carry) * inverse mod 2^32.  For an even divisor the
C source limbs are right-shifted in pairs with MMX psrlq first.
C
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

	movl	PARAM_SRC, %eax
	xorl	%edx, %edx

	cmpl	$1, %ecx
	jnz	L(two_or_more)

	movl	(%eax), %eax		C src low (and only) limb

	divl	PARAM_DIVISOR		C size==1: one plain division is fastest

	movl	PARAM_DST, %ecx
	movl	%eax, (%ecx)

	ret


C -----------------------------------------------------------------------------

L(two_or_more):
	movl	PARAM_DIVISOR, %eax
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC, %ebx
	pushl	%ebp		FRAME_pushl()

L(strip_twos):
	shrl	%eax
	incl	%edx			C will get shift+1

	jnc	L(strip_twos)
	pushl	%esi		FRAME_pushl()

	leal	1(%eax,%eax), %esi	C d without twos
	andl	$127, %eax		C d/2, 7 bits

ifdef(`PIC',`
	LEA(	binvert_limb_table, %ebp)
	Zdisp(	movzbl, 0,(%eax,%ebp), %eax)	C inv 8 bits
',`
	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
')
	pushl	%edi		FRAME_pushl()

	C Two Newton steps: inv' = 2*inv - inv*inv*d doubles the number of
	C correct low bits each time, giving a full 32-bit inverse of d.
	leal	(%eax,%eax), %ebp	C 2*inv

	imull	%eax, %eax		C inv*inv

	movl	PARAM_DST, %edi

	imull	%esi, %eax		C inv*inv*d

	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
	leal	(%ebp,%ebp), %eax	C 2*inv

	imull	%ebp, %ebp		C inv*inv

	movl	%esi, PARAM_DIVISOR	C d without twos
	leal	(%ebx,%ecx,4), %ebx	C src end

	imull	%esi, %ebp		C inv*inv*d

	leal	(%edi,%ecx,4), %edi	C dst end
	negl	%ecx			C -size

	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
	subl	$1, %edx		C shift amount, and clear carry

	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
	pushl	%eax	FRAME_pushl()
	imull	PARAM_DIVISOR, %eax
	cmpl	$1, %eax
	popl	%eax	FRAME_popl()')

	movl	%eax, VAR_INVERSE
	jnz	L(even)			C flags still from subl: shift!=0, divisor even

	movl	(%ebx,%ecx,4), %esi	C src low limb
	jmp	L(odd_entry)


	ALIGN(16)
	nop	C code alignment
L(odd_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1

	imull	%edx, %esi

	movl	PARAM_DIVISOR, %eax
	movl	%esi, -4(%edi,%ecx,4)

	mull	%esi			C carry limb in edx

	subl	%ebp, %edx		C apply carry bit
	movl	(%ebx,%ecx,4), %esi

L(odd_entry):
	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(odd_top)


	imull	%edx, %esi		C last quotient limb, no carry out needed

	movl	%esi, -4(%edi,%ecx,4)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret				C odd path done; must not fall through


C -----------------------------------------------------------------------------

L(even):
	C eax
	C ebx	src end
	C ecx	-size
	C edx	twos
	C esi
	C edi	dst end
	C ebp

	xorl	%ebp, %ebp		C carry bit = 0
Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]

	movd	%edx, %mm7		C twos = shift count for psrlq
	movl	VAR_INVERSE, %edx

	addl	$2, %ecx
	psrlq	%mm7, %mm0		C src[0] shifted down, bits from src[1] above

	movd	%mm0, %esi
	jz	L(even_two)		C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.

L(even_top):
	C eax	scratch
	C ebx	src end
	C ecx	counter, limbs, negative
	C edx	inverse
	C esi	next limb, adjusted for carry
	C edi	dst end
	C ebp	carry bit, 0 or -1
	C
	C mm0	scratch, source limbs
	C mm7	twos

	imull	%edx, %esi

	movl	%esi, -8(%edi,%ecx,4)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movq	-4(%ebx,%ecx,4), %mm0	C unaligned: next two limbs straddled
	psrlq	%mm7, %mm0

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	subl	%edx, %esi		C apply carry limb
	movl	VAR_INVERSE, %edx

	sbbl	%ebp, %ebp		C 0 or -1

	incl	%ecx
	jnz	L(even_top)


L(even_two):
	movd	-4(%ebx), %mm0		C src high limb
	psrlq	%mm7, %mm0

	imull	%edx, %esi

	movl	%esi, -8(%edi)
	movl	PARAM_DIVISOR, %eax

	mull	%esi			C carry limb in edx

	movd	%mm0, %esi
	subl	%ebp, %edx		C apply carry bit

	movl	VAR_INVERSE, %eax
	subl	%edx, %esi		C apply carry limb

	imull	%eax, %esi		C last quotient limb

	movl	%esi, -4(%edi)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	emms_or_femms			C leave MMX state clean for FP callers
	ret

EPILOGUE()
ASM_END()