beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / mod_1_4.asm
blobbb7597edd29180cd138a427efa6813a8ba770371
1 dnl x86-32 mpn_mod_1s_4p, requiring cmov.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009, 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C P5 ?
37 C P6 model 0-8,10-12 ?
38 C P6 model 9 (Banias) ?
39 C P6 model 13 (Dothan) 6
40 C P4 model 0 (Willamette) ?
41 C P4 model 1 (?) ?
42 C P4 model 2 (Northwood) 15.5
43 C P4 model 3 (Prescott) ?
44 C P4 model 4 (Nocona) ?
45 C AMD K6 ?
46 C AMD K7 4.75
47 C AMD K8 ?
49 ASM_START()
50 TEXT
51 ALIGN(16)
C mpn_mod_1s_4p -- reduce the n-limb number at up modulo a single limb,
C folding four limbs per loop iteration with the constants stored by
C mpn_mod_1s_4p_cps.
C
C Stack arguments as addressed after the prologue below
C (4 pushes + 28 bytes of locals + return address = 48 bytes):
C	48(%esp)  up	limb array, higher index = higher weight
C	52(%esp)  n	limb count, decremented in place as the loop counter
C	56(%esp)  b	divisor limb
C			NOTE(review): presumably already shifted left by cnt,
C			matching the normalized b used by the cps routine -- confirm
C	60(%esp)  cps[]	table: bi, cnt, B1modb, B2modb, B3modb, B4modb, B5modb
C
C Locals:    4(%esp)..20(%esp) hold copies of B1modb..B5modb.
C Registers: %edi:%ebp is the running two-limb residue (high:low),
C            %esi walks downwards through the limb array.
C Result:    returned in %eax.
PROLOGUE(mpn_mod_1s_4p)
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	sub	$28, %esp
	mov	60(%esp), %edi		C cps[]
	mov	8(%edi), %eax		C B1modb
	mov	12(%edi), %edx		C B2modb
	mov	16(%edi), %ecx		C B3modb
	mov	20(%edi), %esi		C B4modb
	mov	24(%edi), %edi		C B5modb
	mov	%eax, 4(%esp)
	mov	%edx, 8(%esp)
	mov	%ecx, 12(%esp)
	mov	%esi, 16(%esp)
	mov	%edi, 20(%esp)
	mov	52(%esp), %eax		C n
	xor	%edi, %edi		C high residue limb starts at zero
	mov	48(%esp), %esi		C up
	lea	-12(%esi,%eax,4), %esi	C esi = &up[n-3]

C Dispatch on n mod 4 so that the main loop always sees groups of four limbs.
	and	$3, %eax
	je	L(b0)
	cmp	$2, %eax
	jc	L(b1)
	je	L(b2)

C n mod 4 = 3: start from the top three limbs.
L(b3):	mov	4(%esi), %eax
	mull	4(%esp)			C up[n-2] * B1modb
	mov	(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	8(%esp)			C up[n-1] * B2modb, summed at m0
	lea	-12(%esi), %esi
	jmp	L(m0)

C n mod 4 = 0: start from the top four limbs.
L(b0):	mov	(%esi), %eax
	mull	4(%esp)			C up[n-3] * B1modb
	mov	-4(%esi), %ebp
	add	%eax, %ebp
	adc	%edx, %edi
	mov	4(%esi), %eax
	mull	8(%esp)			C up[n-2] * B2modb
	add	%eax, %ebp
	adc	%edx, %edi
	mov	8(%esi), %eax
	mull	12(%esp)		C up[n-1] * B3modb, summed at m0
	lea	-16(%esi), %esi
	jmp	L(m0)

C n mod 4 = 1: the top limb alone is the initial residue, high limb stays 0.
L(b1):	mov	8(%esi), %ebp
	lea	-4(%esi), %esi
	jmp	L(m1)

C n mod 4 = 2: the top two limbs are the initial residue.
L(b2):	mov	8(%esi), %edi
	mov	4(%esi), %ebp
	lea	-8(%esi), %esi
	jmp	L(m1)

C Main loop.  %ecx:%ebx collects the next four limbs weighted by 1, B1modb,
C B2modb and B3modb, plus the old low residue limb times B4modb; the old high
C residue limb times B5modb is added at m0.
	ALIGN(16)
L(top):	mov	(%esi), %eax
	mull	4(%esp)
	mov	-4(%esi), %ebx
	xor	%ecx, %ecx
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	4(%esi), %eax
	mull	8(%esp)
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	8(%esi), %eax
	mull	12(%esp)
	add	%eax, %ebx
	adc	%edx, %ecx
	lea	-16(%esi), %esi
	mov	16(%esp), %eax
	mul	%ebp			C low residue limb * B4modb
	add	%eax, %ebx
	adc	%edx, %ecx
	mov	20(%esp), %eax
	mul	%edi			C high residue limb * B5modb
	mov	%ebx, %ebp
	mov	%ecx, %edi
L(m0):	add	%eax, %ebp
	adc	%edx, %edi
L(m1):	subl	$4, 52(%esp)		C four limbs consumed
	ja	L(top)			C loop while the remaining count is above zero

C Fold the high residue limb with B1modb, giving the two-limb value %edx:%ebp.
L(end):	mov	4(%esp), %eax
	mul	%edi
	mov	60(%esp), %edi		C cps[]
	add	%eax, %ebp
	adc	$0, %edx
C Shift the two-limb value left by cnt and reduce it using the inverse bi.
	mov	4(%edi), %ecx		C cnt
	mov	%edx, %esi
	mov	%ebp, %eax
	sal	%cl, %esi
	mov	%ecx, %ebx		C keep cnt
	neg	%ecx			C low 5 bits give 32-cnt for cnt in 1..31
	shr	%cl, %eax
	or	%esi, %eax		C high limb of the shifted value
	lea	1(%eax), %esi		C high limb + 1
	mull	(%edi)			C high limb * bi
	mov	%ebx, %ecx		C cnt back in %cl
	mov	%eax, %ebx
	mov	%ebp, %eax
	mov	56(%esp), %ebp		C b
	sal	%cl, %eax		C low limb of the shifted value
	add	%eax, %ebx
	adc	%esi, %edx		C quotient estimate
	imul	%ebp, %edx
	sub	%edx, %eax		C candidate remainder
	lea	(%eax,%ebp), %edx	C candidate + b, flags untouched
	cmp	%eax, %ebx
	cmovc(	%edx, %eax)		C first adjustment: add b back
	mov	%eax, %edx
	sub	%ebp, %eax
	cmovc(	%edx, %eax)		C second adjustment: undo sub if it borrowed
	add	$28, %esp
	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	shr	%cl, %eax		C undo the shift by cnt
	ret				C without this, control runs on past EPILOGUE
EPILOGUE()
180 ALIGN(16)
C mpn_mod_1s_4p_cps -- fill the constant table consumed by mpn_mod_1s_4p.
C
C Stack arguments as addressed after the four pushes below
C (4 pushes + return address = 20 bytes):
C	20(%esp)  cps[]	destination table
C	24(%esp)  b	divisor limb
C			NOTE(review): b = 0 is not checked here; bsr leaves its
C			destination undefined for a zero source -- callers
C			presumably never pass 0, confirm
C
C Table layout written by this routine (byte offsets from cps):
C	0 bi   4 cnt   8 B1modb   12 B2modb   16 B3modb   20 B4modb   24 B5modb
C The BNmodb entries are stored shifted right by cnt.
C No value is loaded into %eax for the caller.
PROLOGUE(mpn_mod_1s_4p_cps)
C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
	push	%ebp
	push	%edi
	push	%esi
	push	%ebx
	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
	mov	24(%esp), %ebx		C b
	bsr	%ebx, %ecx		C index of the highest set bit
	xor	$31, %ecx		C cnt = number of leading zero bits
	sal	%cl, %ebx		C b << cnt
C bi = quotient of the two-limb value (not b):0xffffffff divided by b,
C where b is the shifted divisor.
	mov	%ebx, %edx
	not	%edx
	mov	$-1, %eax
	div	%ebx
	xor	%edi, %edi
	sub	%ebx, %edi		C edi = -b
	mov	$1, %esi
	mov	%eax, (%ebp)		C store bi
	mov	%ecx, 4(%ebp)		C store cnt
	shld	%cl, %eax, %esi		C esi = 1:bi shifted left by cnt, high limb
	imul	%edi, %esi		C times -b
	mov	%eax, %edi		C keep bi for the steps below
	mul	%esi

C Each step below stores the current value shifted right by cnt, then derives
C the next one from it using bi and the shifted b, with one conditional
C adjustment by b selected through cmovnc.
	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 8(%ebp)		C store B1modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 12(%ebp)		C store B2modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 16(%ebp)		C store B3modb

	not	%edx
	imul	%ebx, %edx
	lea	(%edx,%ebx), %esi
	cmp	%edx, %eax
	cmovnc(	%edx, %esi)
	mov	%edi, %eax
	mul	%esi

	add	%esi, %edx
	shr	%cl, %esi
	mov	%esi, 20(%ebp)		C store B4modb

C Last step: b is not needed afterwards, so %ebx itself receives the result.
	not	%edx
	imul	%ebx, %edx
	add	%edx, %ebx
	cmp	%edx, %eax
	cmovnc(	%edx, %ebx)

	shr	%cl, %ebx
	mov	%ebx, 24(%ebp)		C store B5modb

	pop	%ebx
	pop	%esi
	pop	%edi
	pop	%ebp
	ret				C without this, control runs off the end of the function
EPILOGUE()