source/libs/gmp/gmp-src/mpn/x86_64/mod_1_4.asm

dnl AMD64 mpn_mod_1s_4p

dnl Contributed to the GNU project by Torbjorn Granlund.

dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 3
C AMD K10	 3
C Intel P4	15.5
C Intel core2	 5
C Intel corei	 4
C Intel atom	23
C VIA nano	 4.75

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
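
C Entry registers, as read by the code below (the names up, n, b and cps
C are descriptive labels added in this comment, not from the original):
C   %rdi  up   limb vector
C   %rsi  n    number of limbs
C   %rdx  b    divisor
C   %rcx  cps  table written by mpn_mod_1s_4p_cps: bi, cnt, B1modb..B5modb
C              at byte offsets 0, 8, 16, ..., 48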
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p)
	FUNC_ENTRY(4)
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbp
	push	%rbx

	mov	%rdx, %r15
	mov	%rcx, %r14
	mov	16(%rcx), %r11		C B1modb
	mov	24(%rcx), %rbx		C B2modb
	mov	32(%rcx), %rbp		C B3modb
	mov	40(%rcx), %r13		C B4modb
	mov	48(%rcx), %r12		C B5modb
	xor	R32(%r8), R32(%r8)
	mov	R32(%rsi), R32(%rdx)
	and	$3, R32(%rdx)
	je	L(b0)
	cmp	$2, R32(%rdx)
	jc	L(b1)
	je	L(b2)
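
C Dispatch on n mod 4.  Each entry point below loads or folds the most
C significant n mod 4 limbs (four when n is a multiple of 4) into the
C two-limb residue <%r8,%r9>, then joins the common path at L(m0)/L(m1).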
L(b3):	lea	-24(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	jmp	L(m0)

	ALIGN(8)
L(b0):	lea	-32(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %rax
	mul	%r11
	mov	(%rdi), %r9
	add	%rax, %r9
	adc	%rdx, %r8
	mov	16(%rdi), %rax
	mul	%rbx
	add	%rax, %r9
	adc	%rdx, %r8
	mov	24(%rdi), %rax
	mul	%rbp
	jmp	L(m0)

	ALIGN(8)
L(b1):	lea	-8(%rdi,%rsi,8), %rdi
	mov	(%rdi), %r9
	jmp	L(m1)

	ALIGN(8)
L(b2):	lea	-16(%rdi,%rsi,8), %rdi
	mov	8(%rdi), %r8
	mov	(%rdi), %r9
	jmp	L(m1)
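
C Main loop, four limbs per iteration.  With B = 2^64 and Bkmodb = B^k mod b
C (from the cps table), the new residue is
C   <%r8,%r9> = up[0] + up[1]*B1modb + up[2]*B2modb + up[3]*B3modb
C               + rl*B4modb + rh*B5modb
C which is congruent to the old value mod b, so the operand shrinks by
C four limbs per round while the remainder class is preserved.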
	ALIGN(16)
L(top):	mov	-24(%rdi), %rax
	mov	-32(%rdi), %r10
	mul	%r11			C up[1] * B1modb
	add	%rax, %r10
	mov	-16(%rdi), %rax
	mov	$0, R32(%rcx)
	adc	%rdx, %rcx
	mul	%rbx			C up[2] * B2modb
	add	%rax, %r10
	mov	-8(%rdi), %rax
	adc	%rdx, %rcx
	sub	$32, %rdi
	mul	%rbp			C up[3] * B3modb
	add	%rax, %r10
	mov	%r13, %rax
	adc	%rdx, %rcx
	mul	%r9			C rl * B4modb
	add	%rax, %r10
	mov	%r12, %rax
	adc	%rdx, %rcx
	mul	%r8			C rh * B5modb
	mov	%r10, %r9
	mov	%rcx, %r8
L(m0):	add	%rax, %r9
	adc	%rdx, %r8
L(m1):	sub	$4, %rsi
	ja	L(top)
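
C Final step: fold the residue high limb via B1modb, shift everything left
C by cnt (cps[1]) so the divisor is normalised, then perform one
C reciprocal-based division step with bi (cps[0]) followed by conditional
C corrections, and shift the remainder back down by cnt.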
L(end):	mov	8(%r14), R32(%rsi)
	mov	%r8, %rax
	mul	%r11
	mov	%rax, %r8
	add	%r9, %r8
	adc	$0, %rdx
	xor	R32(%rcx), R32(%rcx)
	sub	R32(%rsi), R32(%rcx)
	mov	%r8, %rdi
	shr	R8(%rcx), %rdi
	mov	R32(%rsi), R32(%rcx)
	sal	R8(%rcx), %rdx
	or	%rdx, %rdi
	mov	%rdi, %rax
	mulq	(%r14)
	mov	%r15, %rbx
	mov	%rax, %r9
	sal	R8(%rcx), %r8
	inc	%rdi
	add	%r8, %r9
	adc	%rdi, %rdx
	imul	%rbx, %rdx
	sub	%rdx, %r8
	lea	(%r8,%rbx), %rax
	cmp	%r8, %r9
	cmovc	%rax, %r8
	mov	%r8, %rax
	sub	%rbx, %rax
	cmovc	%r8, %rax
	shr	R8(%rcx), %rax
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret
EPILOGUE()
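
C mpn_mod_1s_4p_cps(cps, b) fills the seven-limb table consumed above
C (layout inferred from the stores below, %rbx holding the cps pointer):
C   cps[0] = bi  (mpn_invert_limb of b << cnt)
C   cps[1] = cnt (leading zero count of b)
C   cps[2..6] = B^1 mod b .. B^5 mod b, each computed against b << cnt and
C               shifted back down by cnt before being stored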
	ALIGN(16)
PROLOGUE(mpn_mod_1s_4p_cps)
	FUNC_ENTRY(2)
	push	%rbp
	bsr	%rsi, %rcx
	push	%rbx
	mov	%rdi, %rbx
	push	%r12
	xor	$63, R32(%rcx)
	mov	%rsi, %r12
	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
	sal	R8(%rcx), %r12		C b << cnt
	IFSTD(`	mov	%r12, %rdi	')	C pass parameter
	IFDOS(`	mov	%r12, %rcx	')	C pass parameter
	ASSERT(nz, `test $15, %rsp')
	CALL(	mpn_invert_limb)
	mov	%r12, %r8
	mov	%rax, %r11
	mov	%rax, (%rbx)		C store bi
	mov	%rbp, 8(%rbx)		C store cnt
	neg	%r8
	mov	R32(%rbp), R32(%rcx)
	mov	$1, R32(%rsi)
ifdef(`SHLD_SLOW',`
	shl	R8(%rcx), %rsi
	neg	R32(%rcx)
	mov	%rax, %rbp
	shr	R8(%rcx), %rax
	or	%rax, %rsi
	mov	%rbp, %rax
	neg	R32(%rcx)
',`
	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
')
	imul	%r8, %rsi
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 16(%rbx)		C store B1modb
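
C The four near-identical blocks below each advance from Bkmodb (still
C shifted by cnt, in %rsi) to B(k+1)modb with one reciprocal-based
C division step (multiply by bi, quotient estimate, conditional fix-up),
C storing the down-shifted result in the next table slot.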
	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 24(%rbx)		C store B2modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 32(%rbx)		C store B3modb

	not	%rdx
	imul	%r12, %rdx
	lea	(%rdx,%r12), %rsi
	cmp	%rdx, %rax
	cmovnc	%rdx, %rsi
	mov	%r11, %rax
	mul	%rsi

	add	%rsi, %rdx
	shr	R8(%rcx), %rsi
	mov	%rsi, 40(%rbx)		C store B4modb

	not	%rdx
	imul	%r12, %rdx
	add	%rdx, %r12
	cmp	%rdx, %rax
	cmovnc	%rdx, %r12

	shr	R8(%rcx), %r12
	mov	%r12, 48(%rbx)		C store B5modb

	pop	%r12
	pop	%rbx
	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()