beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / pentium4 / rsh1aors_n.asm
blob5528ce47dae58c7a0babfb1a9a99c49a5e9e71c5
1 dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C cycles/limb
37 C AMD K8,K9 4.13
38 C AMD K10 4.13
39 C Intel P4 5.70
40 C Intel core2 4.75
41 C Intel corei 5
42 C Intel atom 8.75
43 C VIA nano 5.25
45 C TODO
46 C * Try to make this smaller, 746 bytes seem excessive for this 2nd class
47 C function. Less sw pipelining would help, and since we now probably
48 C pipeline somewhat too deeply, it might not affect performance too much.
49 C * A separate small-n loop might speed things as well as make things smaller.
50 C That loop should be selected before pushing registers.
C INPUT PARAMETERS
C Symbolic names for the argument registers used by the code in this file.
define(`rp',	`%rdi')		C result limbs are stored through this pointer
define(`up',	`%rsi')		C first source operand, loaded limb by limb
define(`vp',	`%rdx')		C second source operand, loaded limb by limb
define(`n',	`%rcx')		C limb count, counted down by the main loop
define(`cy',	`%r8')		C carry-in register of the _nc entry point

C Select the arithmetic instruction and the two entry point names according
C to which OPERATION_xxx symbol is defined when this file is processed.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(func,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(func,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

C NOTE(review): presumably declares the ABIs this code supports; the macro is
C defined outside this file.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C NOTE(review): presumably lists every entry point this file can provide, for
C use by the build machinery; the macro is defined outside this file.
MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
C func / func_nc
C
C Computes the n-limb value {up,n} ADDSUB {vp,n}, where func_nc additionally
C applies the carry-in passed in %r8, shifts that value right by one bit and
C stores the n result limbs at rp.  The carry or borrow out of the most
C significant limb becomes the top bit of the most significant result limb.
C The bit shifted out at the bottom, i.e. bit 0 of the lowest limb of the
C sum or difference, is returned in %rax.
C
C The first limb of each operand is loaded unconditionally, so n is assumed
C to be at least 1.  Only the low 32 bits of the carry-in are read; it is
C presumably 0 or 1.
C
C Register use once the feed-in code has run:
C   %r8, %r10, %r11, %r12   sum limbs in flight
C   %r9                     most recently loaded limb of the second operand
C   %r13, %r14              copies of sum limbs, shifted right by one
C   %rax, %rbx              alternating 0/1 carry from one limb to the next
C   %r15                    lowest sum limb, kept for the return value
C %rbx and %r12-%r15 are saved on entry and restored before returning.

	ASM_START()
	TEXT
PROLOGUE(func)
	FUNC_ENTRY(4)
	xor	%r8, %r8		C plain entry point: no carry-in
C On DOS64 jump over the stack load of the carry-in.  On other ABIs no jump
C is emitted, so execution is expected to fall through into the func_nc body.
C NOTE(review): this relies on EPILOGUE and PROLOGUE emitting no instructions
C in between; their definitions are not visible in this file.
IFDOS(`	jmp	L(ent)		')
EPILOGUE()
PROLOGUE(func_nc)
	FUNC_ENTRY(4)
C DOS64 only: fetch the carry-in from the stack.
C NOTE(review): offset 56 presumably accounts for what FUNC_ENTRY pushed;
C confirm against its definition.
IFDOS(`	mov	56(%rsp), %r8	')
L(ent):	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(vp), %r9
	mov	(up), %r15

C Dispatch on n mod 4.  Each feed-in sequence below starts the pipeline,
C biases rp where needed and enters the 4-way unrolled loop at the matching
C point, or jumps directly into the wind-down code when n is 1, 2 or 3.
	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)
	jne	L(n00)

C Carry handling used throughout: setc records the carry of the limb
C operation, then the previous carry is applied with a second ADDSUB; if
C that one carries as well, the recorded carry is forced to 1.
	mov	R32(%r8), R32(%rbx)	C n = 0, 4, 8, ...
	mov	8(up), %r10
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rax)
	ADDSUB	%rbx, %r15		C return bit
	jnc	1f
	mov	$1, R8(%rax)
1:	mov	16(up), %r12
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r10
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	24(up), %r11
	ADDSUB	%r9, %r12
	lea	32(up), up
	mov	24(vp), %r9
	lea	32(vp), vp
	setc	R8(%rax)
	mov	%r10, %r14
	shl	$63, %r10
	shr	%r13
	jmp	L(L00)

C Here n mod 4 is 1, 2 or 3 and %eax holds that remainder.
L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)
	xor	R32(%rbx), R32(%rbx)	C n = 1, 5, 9, ...
	lea	-24(rp), rp		C first store will be to 24(rp)
	mov	R32(%r8), R32(%rax)
	dec	n
	jnz	L(gt1)
	ADDSUB	%r9, %r15		C n = 1: single limb, no loop
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	%r15, %r14
	shl	$63, %rbx		C final carry becomes the top result bit
	shr	%r14
	jmp	L(cj1)
L(gt1):	mov	8(up), %r8
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	16(up), %r10
	ADDSUB	%r9, %r8
	mov	16(vp), %r9
	setc	R8(%rax)
	mov	%r15, %r14
	ADDSUB	%rbx, %r8
	jnc	1f
	mov	$1, R8(%rax)
1:	mov	24(up), %r12
	ADDSUB	%r9, %r10
	mov	24(vp), %r9
	setc	R8(%rbx)
	mov	%r8, %r13
	shl	$63, %r8
	shr	%r14
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

C The jne below still sees the flags of the cmp at L(n00), since jnc does not
C change them: it is taken when n mod 4 is 3.
L(n01):	jne	L(n10)
	lea	-16(rp), rp		C n = 2, 6, 10, ...
	mov	R32(%r8), R32(%rbx)
	mov	8(up), %r11
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rax)
	ADDSUB	%rbx, %r15
	jnc	1f
	mov	$1, R8(%rax)
1:	sub	$2, n
	jnz	L(gt2)
	ADDSUB	%r9, %r11		C n = 2: two limbs, no loop
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	jmp	L(cj2)
L(gt2):	mov	16(up), %r8
	ADDSUB	%r9, %r11
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r15, %r13
	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	24(up), %r10
	ADDSUB	%r9, %r8
	mov	24(vp), %r9
	setc	R8(%rax)
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	xor	R32(%rbx), R32(%rbx)	C n = 3, 7, 11, ...
	lea	-8(rp), rp
	mov	R32(%r8), R32(%rax)
	mov	8(up), %r12
	ADDSUB	%r9, %r15
	mov	8(vp), %r9
	setc	R8(%rbx)
	ADDSUB	%rax, %r15
	jnc	1f
	mov	$1, R8(%rbx)
1:	mov	16(up), %r11
	ADDSUB	%r9, %r12
	mov	16(vp), %r9
	setc	R8(%rax)
	mov	%r15, %r14
	ADDSUB	%rbx, %r12
	jnc	1f
	mov	$1, R8(%rax)
1:	sub	$3, n
	jnz	L(gt3)
	ADDSUB	%r9, %r11		C n = 3: three limbs, no loop
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	jmp	L(cj3)
L(gt3):	mov	24(up), %r8
	ADDSUB	%r9, %r11
	mov	24(vp), %r9
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	lea	24(up), up
	lea	24(vp), vp
	jmp	L(L11)

C Out-of-line fixups for the loop: applying the incoming carry itself
C carried, so force the recorded outgoing carry to 1 and resume.
L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

C Main loop, four limbs per iteration.  Each result limb is formed as
C (older sum limb >> 1) | (bit 0 of the next sum limb << 63).
	ALIGN(16)
L(top):	mov	(up), %r8		C not on critical path
	or	%r13, %r10
	ADDSUB	%r9, %r11		C not on critical path
	mov	(vp), %r9		C not on critical path
	setc	R8(%rbx)		C save carry out
	mov	%r12, %r13		C new for later
	shl	$63, %r12		C bit 0 of newer limb moves to bit 63
	shr	%r14			C older limb shifted right by one
	mov	%r10, (rp)
L(L11):	ADDSUB	%rax, %r11		C apply previous carry out
	jc	L(c0)			C jump if ripple
L(rc0):	mov	8(up), %r10
	or	%r14, %r12
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	setc	R8(%rax)
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	mov	%r12, 8(rp)
L(L10):	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):	mov	16(up), %r12
	or	%r13, %r11
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	setc	R8(%rbx)
	mov	%r8, %r13
	shl	$63, %r8
	shr	%r14
	mov	%r11, 16(rp)
L(L01):	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):	mov	24(up), %r11
	or	%r14, %r8
	ADDSUB	%r9, %r12
	lea	32(up), up
	mov	24(vp), %r9
	lea	32(vp), vp
	setc	R8(%rax)
	mov	%r10, %r14
	shl	$63, %r10
	shr	%r13
	mov	%r8, 24(rp)
	lea	32(rp), rp
L(L00):	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	sub	$4, n
	ja	L(top)			C loop while limbs remain after this group

C Wind-down: complete and store the limbs still in flight.  Nothing more is
C loaded from up or vp from here on.
L(end):	or	%r13, %r10
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, %r13
	shl	$63, %r12
	shr	%r14
	mov	%r10, (rp)
L(cj3):	ADDSUB	%rax, %r11
	jnc	1f
	mov	$1, R8(%rbx)
1:	or	%r14, %r12
	mov	%r11, %r14
	shl	$63, %r11
	shr	%r13
	mov	%r12, 8(rp)
L(cj2):	or	%r13, %r11
	shl	$63, %rbx		C final carry becomes the top result bit
	shr	%r14
	mov	%r11, 16(rp)
L(cj1):	or	%r14, %rbx
	mov	%rbx, 24(rp)

	mov	R32(%r15), R32(%rax)	C return bit 0 of the lowest sum limb
	and	$1, R32(%rax)
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
C The return must be explicit.  Without it control would run on into L(c3)
C below and re-enter the loop with the saved registers already restored.
	ret
C Fourth carry fixup, placed after the return; same role as L(c0) to L(c2).
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()