new beta-0.90.0
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / mod_1_1.asm
blob1bbe6f92d767690a9324fe59ae37454a5ed14107
1 dnl x86-32 mpn_mod_1_1p, requiring cmov.
3 dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
5 dnl Copyright 2010, 2011 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C P5 ?
37 C P6 model 0-8,10-12 ?
38 C P6 model 9 (Banias) ?
39 C P6 model 13 (Dothan) ?
40 C P4 model 0 (Willamette) ?
41 C P4 model 1 (?) ?
42 C P4 model 2 (Northwood) ?
43 C P4 model 3 (Prescott) ?
44 C P4 model 4 (Nocona) ?
45 C AMD K6 ?
46 C AMD K7 7
47 C AMD K8 ?
C Register roles inside mpn_mod_1_1p
define(`ap',   `%ecx')          C Also shift count
define(`r0',   `%esi')          C low limb of the running value
define(`r2',   `%ebp')          C carry mask or folded carry limb
define(`t0',   `%edi')          C scratch
define(`B2mb', `%ebx')          C B2modb - b inside the main loop

C Stack frame once the prologue has pushed the four saved registers and the
C copy of B2modb
C       36(%esp)        pre
C       32(%esp)        b
C       28(%esp)        n
C       24(%esp)        ap
C       20(%esp)        return
C       16(%esp)        %ebp
C       12(%esp)        %edi
C        8(%esp)        %esi
C        4(%esp)        %ebx
C         (%esp)        B2modb

define(`pre',    `36(%esp)')
define(`b',      `32(%esp)')
define(`n',      `28(%esp)')
define(`B2modb', `(%esp)')
C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
C
C Folds the n limbs at ap, most significant first, down to two limbs using
C B2modb, then finishes with a division step by b using the inverse bi.
C The result is left in %eax.
C
C The pre array contains bi, cnt, B1modb, B2modb (byte offsets 0, 4, 8, 12)
C Note: This implementation needs B1modb only when cnt > 0
C
C NOTE(review): the n < 3 path loads ap[n-1] and ap[n-2] unconditionally, so
C n >= 2 is presumably guaranteed by callers -- confirm.
C NOTE(review): b is used unshifted in the final division step and the result
C is shifted right by cnt, so b is presumably passed in already shifted left
C by cnt -- confirm against callers.

ASM_START()
        TEXT
        ALIGN(8)
PROLOGUE(mpn_mod_1_1p)
        push    %ebp
        push    %edi
        push    %esi
        push    %ebx
        mov     32(%esp), %ebp          C pre[] (B2modb slot not pushed yet)

        mov     12(%ebp), %eax          C B2modb
        push    %eax                    C Put it on stack

        mov     n, %edx
        mov     24(%esp), ap

        lea     (ap, %edx, 4), ap       C ap = one past the most significant limb
        mov     -4(ap), %eax            C most significant limb
        cmp     $3, %edx
        jnc     L(first)                C taken when n >= 3
        mov     -8(ap), r0              C n < 3: second limb, skip the folding
        jmp     L(reduce_two)

L(first):
        C First iteration, no r2
        mull    B2modb                  C %edx:%eax = top limb * B2modb
        mov     -12(ap), r0
        add     %eax, r0
        mov     -8(ap), %eax
        adc     %edx, %eax
        sbb     r2, r2                  C r2 = 0 or -1 from the carry
        subl    $3, n                   C three limbs consumed; ZF feeds the jz
        lea     -16(ap), ap             C lea leaves the flags alone
        jz      L(reduce_three)

        mov     B2modb, B2mb
        sub     b, B2mb                 C B2mb = B2modb - b
        lea     (B2mb, r0), t0          C t0 = r0 + B2modb - b
        jmp     L(mid)

        ALIGN(16)
L(top): C Loopmixed to 7 c/l on k7
        add     %eax, r0
        lea     (B2mb, r0), t0          C t0 = r0 + B2modb - b, flags untouched
        mov     r2, %eax
        adc     %edx, %eax
        sbb     r2, r2                  C r2 = 0 or -1 from the carry
L(mid): mull    B2modb
        and     B2modb, r2              C r2 = B2modb if there was a carry, else 0
        add     r0, r2
        decl    n                       C dec keeps CF from the add; ZF feeds the jnz
        mov     (ap), r0                C next lower limb
        cmovc(  t0, r2)                 C if the add carried, take t0 instead
        lea     -4(ap), ap
        jnz     L(top)

        add     %eax, r0
        mov     r2, %eax
        adc     %edx, %eax
        sbb     r2, r2

L(reduce_three):
        C Eliminate r2
        and     b, r2                   C r2 = b if there was a carry, else 0
        sub     r2, %eax

L(reduce_two):
        mov     pre, %ebp
        movb    4(%ebp), %cl            C cnt; %ecx is no longer needed as ap
        test    %cl, %cl
        jz      L(normalized)

        C Unnormalized, use B1modb to reduce to size < B b
        mull    8(%ebp)                 C high limb * B1modb
        xor     t0, t0
        add     %eax, r0
        adc     %edx, t0
        mov     t0, %eax

        C Left-shift to normalize
        shld    %cl, r0, %eax           C Always use shld?

        shl     %cl, r0
        jmp     L(udiv)

L(normalized):
        mov     %eax, t0
        sub     b, t0
        cmovnc( t0, %eax)               C high limb -= b when it is >= b

L(udiv):
        lea     1(%eax), t0             C high limb + 1
        mull    (%ebp)                  C high limb * bi
        mov     b, %ebx                 C Needed in register for lea
        add     r0, %eax
        adc     t0, %edx                C candidate quotient limb
        imul    %ebx, %edx
        sub     %edx, r0                C candidate remainder
        cmp     r0, %eax
        lea     (%ebx, r0), %eax        C candidate remainder + b
        cmovnc( r0, %eax)               C keep r0 unless the cmp borrowed
        cmp     %ebx, %eax
        jnc     L(fix)                  C still >= b, subtract once more
L(ok):  shr     %cl, %eax               C undo the normalization shift

        add     $4, %esp                C drop the B2modb slot
        pop     %ebx
        pop     %esi
        pop     %edi
        pop     %ebp

        ret                             C return before the out-of-line L(fix) path
L(fix): sub     %ebx, %eax
        jmp     L(ok)
EPILOGUE()
C mpn_mod_1_1p_cps
C
C Computes the four constants read by mpn_mod_1_1p and stores them through
C the pointer passed as first stack argument, at byte offsets 0, 4, 8, 12:
C bi, cnt, B1modb, B2modb.  The second stack argument is the divisor b.
C
C NOTE(review): b = 0 would leave the bsr result undefined and make the div
C fault, so b is presumably nonzero -- confirm against callers.
C NOTE(review): %ebx is saved and restored although no instruction here
C writes it.

PROLOGUE(mpn_mod_1_1p_cps)
        push    %ebp
        mov     12(%esp), %ebp          C b (second argument)
        push    %esi
        bsr     %ebp, %ecx              C index of the highest set bit of b
        push    %ebx
        xor     $31, %ecx               C cnt = 31 - index
        mov     16(%esp), %esi          C output array (first argument)
        sal     %cl, %ebp               C b shifted left by cnt
        mov     %ebp, %edx
        not     %edx
        mov     $-1, %eax               C dividend %edx:%eax = ~b : all ones
        div     %ebp    C On K7, invert_limb would be a few cycles faster.
        mov     %eax, (%esi)            C store bi
        mov     %ecx, 4(%esi)           C store cnt
        neg     %ebp                    C -b, for the two products below
        mov     $1, %edx
        shld    %cl, %eax, %edx         C shift top cnt bits of bi in below the 1
        imul    %ebp, %edx
        shr     %cl, %edx
        imul    %ebp, %eax
        mov     %edx, 8(%esi)           C store B1modb
        mov     %eax, 12(%esi)          C store B2modb
        pop     %ebx
        pop     %esi
        pop     %ebp
        ret
EPILOGUE()