beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / p6 / dive_1.asm
blob7d61a184e930e25c0b20744193b4cc1f143f673c
1 dnl Intel P6 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C odd even divisor
35 C P6: 10.0 12.0 cycles/limb
38 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
39 C mp_limb_t divisor);
41 C The odd case is basically the same as mpn_modexact_1_odd, just with an
42 C extra store, and it runs at the same 10 cycles which is the dependent
43 C chain.
45 C The shifts for the even case aren't on the dependent chain so in principle
46 C it could run the same too, but nothing running at 10 has been found.
47 C Perhaps there are too many uops (an extra 4 over the odd case).
48 C Frame layout: positive offsets are the four arguments in prototype order, negative offsets are locals inside the STACK_SPACE bytes taken from esp.
49 defframe(PARAM_DIVISOR,16) C divisor argument; the setup code overwrites this slot with the odd part of the divisor
50 defframe(PARAM_SIZE, 12) C size argument, a limb count
51 defframe(PARAM_SRC, 8) C src argument
52 defframe(PARAM_DST, 4) C dst argument; the setup code overwrites this slot with &dst[size]
54 defframe(SAVE_EBX, -4) C SAVE_xxx: slots where ebx, esi, edi and ebp are kept until the exit paths reload them
55 defframe(SAVE_ESI, -8)
56 defframe(SAVE_EDI, -12)
57 defframe(SAVE_EBP, -16)
58 defframe(VAR_INVERSE, -20) C inverse of the odd divisor modulo 2^GMP_LIMB_BITS, read by every imull in the loops
59 deflit(STACK_SPACE, 20) C five 4-byte slots at offsets -4 to -20
61 TEXT
63 ALIGN(16)
64 PROLOGUE(mpn_divexact_1)
65 deflit(`FRAME',0)
66 C Setup: split the divisor into 2^shift times an odd d, invert d modulo 2^GMP_LIMB_BITS, then branch on shift.
67 movl PARAM_DIVISOR, %eax
68 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
70 movl %esi, SAVE_ESI
71 movl PARAM_SRC, %esi
73 movl %ebx, SAVE_EBX
74 movl PARAM_SIZE, %ebx
76 bsfl %eax, %ecx C trailing twos
78 movl %ebp, SAVE_EBP
80 shrl %cl, %eax C d without twos
82 movl %eax, %edx
83 shrl %eax C d/2 without twos
85 movl %edx, PARAM_DIVISOR C from here on PARAM_DIVISOR holds the odd d
86 andl $127, %eax
87 C eax is now the 7 bit table index, bits 1 to 7 of the odd d
88 ifdef(`PIC',`
89 LEA( binvert_limb_table, %ebp)
90 movzbl (%eax,%ebp), %ebp C inv 8 bits
91 ',`
92 movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
93 ')
94 C Two rounds of inv = 2*inv - inv*inv*d extend the table inverse to a full limb; pointer setup is interleaved.
95 leal (%ebp,%ebp), %eax C 2*inv
97 imull %ebp, %ebp C inv*inv
99 movl %edi, SAVE_EDI
100 movl PARAM_DST, %edi
102 leal (%esi,%ebx,4), %esi C src end
104 imull PARAM_DIVISOR, %ebp C inv*inv*d
106 subl %ebp, %eax C inv = 2*inv - inv*inv*d
107 leal (%eax,%eax), %ebp C 2*inv
109 imull %eax, %eax C inv*inv
111 leal (%edi,%ebx,4), %edi C dst end
112 negl %ebx C -size
114 movl %edi, PARAM_DST C keep dst end in memory, the even loop uses edi as scratch
116 imull PARAM_DIVISOR, %eax C inv*inv*d
118 subl %eax, %ebp C inv = 2*inv - inv*inv*d
120 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
121 movl PARAM_DIVISOR, %eax
122 imull %ebp, %eax
123 cmpl $1, %eax')
125 movl %ebp, VAR_INVERSE
126 movl (%esi,%ebx,4), %eax C src[0]
128 orl %ecx, %ecx C zero shift means the divisor was already odd
129 jnz L(even)
131 C ecx initial carry is zero
132 jmp L(odd_entry)
135 C The dependent chain here is
137 C subl %edx, %eax 1
138 C imull %ebp, %eax 4
139 C mull PARAM_DIVISOR 5
140 C ----
141 C total 10
143 C and this is the measured speed. No special scheduling is necessary, out
144 C of order execution hides the load latency.
146 L(odd_top):
147 C eax scratch (src limb)
148 C ebx counter, limbs, negative
149 C ecx carry bit
150 C edx carry limb, high of last product
151 C esi &src[size]
152 C edi &dst[size]
153 C ebp
154 C Each pass: edx = high limb of q*d, l = src[i] - carry bit - edx, then q = l*inverse is stored to dst[i].
155 mull PARAM_DIVISOR C eax holds the quotient limb just stored; edx = high limb of q*d
157 movl (%esi,%ebx,4), %eax C src[i]
158 subl %ecx, %eax
160 sbbl %ecx, %ecx C ecx = 0 or -1, the borrow from the carry bit
161 subl %edx, %eax
163 sbbl $0, %ecx C also count a borrow from the carry limb
165 L(odd_entry):
166 imull VAR_INVERSE, %eax C quotient limb
168 movl %eax, (%edi,%ebx,4) C dst[i]
169 negl %ecx C turn the negative borrow count back into a carry bit
171 incl %ebx
172 jnz L(odd_top)
174 C Odd case done: reload the saved registers, release the frame and return.
175 movl SAVE_ESI, %esi
177 movl SAVE_EDI, %edi
179 movl SAVE_EBP, %ebp
181 movl SAVE_EBX, %ebx
182 addl $STACK_SPACE, %esp
184 ret
187 L(even):
188 C eax src[0]
189 C ebx counter, limbs, negative
190 C ecx shift
191 C edx
192 C esi &src[size]
193 C edi &dst[size]
194 C ebp inverse, already stored in VAR_INVERSE
195 C Even divisor: each source limb is shifted right by ecx bits before the odd d is divided out.
196 xorl %ebp, %ebp C initial carry bit
197 xorl %edx, %edx C initial carry limb (for size==1)
199 incl %ebx C ebx = -(size-1), which is zero when size is 1
200 jz L(even_one)
202 movl (%esi,%ebx,4), %edi C src[1]
204 shrdl( %cl, %edi, %eax) C eax = src[0] shifted right, upper bits filled from src[1]
206 jmp L(even_entry)
209 L(even_top):
210 C eax scratch
211 C ebx counter, limbs, negative
212 C ecx shift
213 C edx scratch
214 C esi &src[size]
215 C edi &dst[size] and scratch
216 C ebp carry bit
217 C Each pass: shift the next source limb right by cl, subtract carry bit and carry limb, multiply by the inverse.
218 movl (%esi,%ebx,4), %edi C next higher source limb, supplies the bits shifted in
220 mull PARAM_DIVISOR C eax holds the quotient limb just stored; edx = high limb of q*d
222 movl -4(%esi,%ebx,4), %eax C source limb being divided
223 shrdl( %cl, %edi, %eax)
225 subl %ebp, %eax
227 sbbl %ebp, %ebp C ebp = 0 or -1, the borrow from the carry bit
228 subl %edx, %eax
230 sbbl $0, %ebp C also count a borrow from the carry limb
232 L(even_entry):
233 imull VAR_INVERSE, %eax C quotient limb
235 movl PARAM_DST, %edi C edi was scratch above, reload &dst[size]
236 negl %ebp C turn the negative borrow count back into a carry bit
238 movl %eax, -4(%edi,%ebx,4)
239 incl %ebx
240 jnz L(even_top)
242 C Top limb: there is no higher limb to shift in, so a plain shrl is used.
244 mull PARAM_DIVISOR
246 movl -4(%esi), %eax C src[size-1]
248 L(even_one):
249 shrl %cl, %eax
250 movl SAVE_ESI, %esi
252 subl %ebp, %eax
253 movl SAVE_EBP, %ebp
255 subl %edx, %eax
256 movl SAVE_EBX, %ebx
258 imull VAR_INVERSE, %eax
260 movl %eax, -4(%edi) C dst[size-1]
261 movl SAVE_EDI, %edi
262 addl $STACK_SPACE, %esp
264 ret
266 EPILOGUE()
267 ASM_END()