dnl  Provenance: luatex.git / source/libs/gmp/gmp-src/mpn/x86_64/pentium4/aorslshC_n.asm
dnl  (gitweb blob d03c6a3f305e72a4d741f164321f2ba4339c2398)
dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
dnl  C is 1, 2, 3.  Optimized for Pentium 4.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
C	     cycles/limb
C AMD K8,K9	 3.8
C AMD K10	 3.8
C Intel P4	 5.8
C Intel core2	 4.75
C Intel corei	 4.75
C Intel atom	 ?
C VIA nano	 4.75

C INPUT PARAMETERS (SysV AMD64; DOS64 entry remapped by FUNC_ENTRY)
define(`rp',	`%rdi')		C result limb vector
define(`up',	`%rsi')		C addend/minuend limb vector
define(`vp',	`%rdx')		C limb vector to be shifted left by C
define(`n',	`%rcx')		C limb count, n >= 1

C M = 2^LSH; used as the index scale in lea to fold the left shift of the
C high part of each vp limb into one instruction.  LSH is defined (1, 2,
C or 3) by the file that includes this template; RSH = 64 - LSH... wait,
C actually RSH = 32 - LSH since the shifted-out bits are kept in a 32-bit
C register -- TODO(review): confirm against the including addlsh*_n.asm.
define(M, eval(m4_lshift(1,LSH)))

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
C mpn_addlshC_n / mpn_sublshC_n (name chosen via `func' by the including
C file): rp[i] = up[i] ADDSUB (vp[i] << LSH) for i in [0, n); returns the
C final carry/borrow in %rax.
C
C Register roles inside the loop:
C   %r9              current (vp limb << LSH) combined via lea
C   %rbp (32-bit)    high 32 bits of the current vp limb; shifted right by
C                    RSH it yields the bits carried into the next limb
C   %r8/%r10/%r11/%r12  four-way software pipeline of up/result limbs
C   %rax/%rbx        alternating saved carry-out flags (0 or 1)
C
C The loop is 4-way unrolled; the entry code dispatches on n mod 4 and
C biases rp/up/vp so all paths join the common loop body.  Carries are
C rippled explicitly (setc + ADDSUB + jc fixup stubs) rather than with
C adc, which is what makes this fast on Pentium 4.
C
C NOTE(review): the gitweb scrape this was recovered from had the page's
C line numbers fused onto every instruction (breaking assembly) and had
C dropped the `ret' after FUNC_EXIT(); both are repaired below.  All
C instructions are otherwise unchanged.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbx
	push	%r12
	push	%rbp

	mov	(vp), %r9
	shl	$LSH, %r9		C low part of first shifted limb
	mov	4(vp), R32(%rbp)	C high 32 bits of first vp limb

	xor	R32(%rbx), R32(%rbx)	C clear incoming carry

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)		C dispatch on n mod 4
	jne	L(n00)			C n = 0, 4, 8, ...

	mov	(up), %r8
	mov	8(up), %r10
	shr	$RSH, R32(%rbp)		C bits shifted out of previous limb
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9	C next shifted limb = carry-in bits + (limb << LSH)
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	-16(rp), rp		C bias rp for the L(L00) entry offsets
	jmp	L(L00)

L(n00):	cmp	$2, R32(%rax)
	jnc	L(n01)			C n = 1, 5, 9, ...
	mov	(up), %r11
	lea	-8(rp), rp
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	dec	n
	jz	L(1)			C jump for n = 1
	mov	8(up), %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	mov	12(vp), R32(%rbp)
	lea	8(up), up
	lea	8(vp), vp
	jmp	L(L01)

L(n01):	jne	L(n10)			C n = 2, 6, 10, ...
	mov	(up), %r12
	mov	8(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	lea	16(up), up
	lea	16(vp), vp
	jmp	L(L10)

L(n10):	mov	(up), %r10		C n = 3, 7, 11, ...
	mov	8(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	12(vp), R32(%rbp)
	lea	-24(rp), rp
	lea	-8(up), up
	lea	-8(vp), vp
	jmp	L(L11)

C Out-of-line carry-ripple fixups: applying the saved carry overflowed,
C so force the next saved carry to 1 and resume.
L(c0):	mov	$1, R8(%rbx)
	jmp	L(rc0)
L(c1):	mov	$1, R8(%rax)
	jmp	L(rc1)
L(c2):	mov	$1, R8(%rbx)
	jmp	L(rc2)

	ALIGN(16)
L(top):	mov	(up), %r8	C not on critical path
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11	C not on critical path
	mov	(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)	C save carry out
	mov	4(vp), R32(%rbp)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11	C apply previous carry out
	jc	L(c0)		C jump if ripple
L(rc0):
L(L01):	mov	8(up), %r10
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r8
	mov	8(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rax)
	mov	12(vp), R32(%rbp)
	mov	%r11, 8(rp)
	ADDSUB	%rbx, %r8
	jc	L(c1)
L(rc1):
L(L00):	mov	16(up), %r12
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r10
	mov	16(vp), %r9
	lea	(%rbp,%r9,M), %r9
	setc	R8(%rbx)
	mov	20(vp), R32(%rbp)
	mov	%r8, 16(rp)
	ADDSUB	%rax, %r10
	jc	L(c2)
L(rc2):
L(L11):	mov	24(up), %r11
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r12
	mov	24(vp), %r9
	lea	(%rbp,%r9,M), %r9
	lea	32(up), up
	lea	32(vp), vp
	setc	R8(%rax)
	mov	-4(vp), R32(%rbp)	C 28(vp) before the vp bump above
	mov	%r10, 24(rp)
	ADDSUB	%rbx, %r12
	jc	L(c3)
L(rc3):	lea	32(rp), rp
L(L10):	sub	$4, n
	ja	L(top)

L(end):					C wind down: one limb still in flight
	shr	$RSH, R32(%rbp)
	ADDSUB	%r9, %r11
	setc	R8(%rbx)
	mov	%r12, (rp)
	ADDSUB	%rax, %r11
	jnc	L(1)
	mov	$1, R8(%rbx)
L(1):	mov	%r11, 8(rp)
	lea	(%rbx,%rbp), R32(%rax)	C return carry + top shifted-out bits
	pop	%rbp
	pop	%r12
	pop	%rbx
	FUNC_EXIT()
	ret				C restored: scrape dropped this line;
					C without it we fall through into L(c3)
L(c3):	mov	$1, R8(%rax)
	jmp	L(rc3)
EPILOGUE()
ASM_END()