beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / mod_1_2.asm
blob 09d856e6e6b01f278b47b8f00d233c641bd36754
1 dnl AMD64 mpn_mod_1s_2p
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2009-2012, 2014 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 4
37 C AMD K10 4
38 C Intel P4 19
39 C Intel core2 8
40 C Intel NHM 6.5
41 C Intel SBR 4.5
42 C Intel atom 28
43 C VIA nano 8
45 ABI_SUPPORT(DOS64) C declares DOS64 as an ABI this file supports (macro is defined outside this file)
46 ABI_SUPPORT(STD64) C likewise for STD64
48 ASM_START()
49 TEXT C presumably selects the code section -- confirm in config.m4
50 ALIGN(16) C 16-byte alignment for the entry point that follows
C mpn_mod_1s_2p -- reduce an n-limb operand modulo a single-limb divisor,
C folding two limbs per step with the constants stored by mpn_mod_1s_2p_cps.
C
C Arguments, as used after FUNC_ENTRY(4):
C   rdi  ap   operand limbs; they are consumed from ap[n-1] downwards
C   rsi  n    limb count, n >= 1 (n = 0 is not handled: the even path
C             reads ap[n-1] and ap[n-2] unconditionally)
C   rdx  d    divisor used by the final reduction; presumably already
C             shifted left by cnt by the caller -- TODO confirm
C   rcx  cps  table: cps[0] = bi, cps[1] = cnt, cps[2] = B1modb,
C             cps[3] = B2modb, cps[4] = B3modb
C Result: rax = remainder, shifted right by cnt before returning.
C
C Register roles:
C   r10, rbx, rbp    B1modb, B2modb, B3modb
C   r8:r9, r12:r11   two alternating two-limb accumulators (high:low)
C   r13, r14         cps pointer and divisor, kept for the final reduction
C   r14, r13, r12, rbp, rbx are pushed on entry and popped before returning.
51 PROLOGUE(mpn_mod_1s_2p)
52 FUNC_ENTRY(4)
53 push %r14
54 test $1, R8(%rsi) C parity of n; the flags stay live until the je below (mov and push leave them alone)
55 mov %rdx, %r14 C r14 = divisor
56 push %r13
57 mov %rcx, %r13 C r13 = cps
58 push %r12
59 push %rbp
60 push %rbx
61 mov 16(%rcx), %r10 C r10 = cps[2], B1modb
62 mov 24(%rcx), %rbx C rbx = cps[3], B2modb
63 mov 32(%rcx), %rbp C rbp = cps[4], B3modb
64 je L(b0) C n even
65 dec %rsi C n odd: rsi = n-1
66 je L(one) C n = 1
67 mov -8(%rdi,%rsi,8), %rax C ap[n-2]
68 mul %r10 C ap[n-2] * B1modb
69 mov %rax, %r9
70 mov %rdx, %r8
71 mov (%rdi,%rsi,8), %rax C ap[n-1]
72 add -16(%rdi,%rsi,8), %r9 C + ap[n-3]
73 adc $0, %r8
74 mul %rbx C ap[n-1] * B2modb
75 add %rax, %r9
76 adc %rdx, %r8 C r8:r9 = ap[n-3] + ap[n-2]*B1modb + ap[n-1]*B2modb
77 jmp L(11)
79 L(b0): mov -8(%rdi,%rsi,8), %r8 C n even: start with r8:r9 = ap[n-1]:ap[n-2]
80 mov -16(%rdi,%rsi,8), %r9
82 L(11): sub $4, %rsi C rsi = (limbs not yet consumed) - 2
83 jb L(ed2) C no limbs left to fold
84 lea 40(%rdi,%rsi,8), %rdi C bias ap so the next pair sits at -40 and -32
85 mov -40(%rdi), %r11 C low limb of the next pair
86 mov -32(%rdi), %rax C high limb of the next pair
87 jmp L(m0)
C Main loop, unrolled twice. Each half computes
C   new = low limb + high limb * B1modb + acc_lo * B2modb + acc_hi * B3modb
C writing the other accumulator; the B3modb product is left in rdx:rax and
C is added at the start of the following half.
89 ALIGN(16)
90 L(top): mov -24(%rdi), %r9 C low limb of the next pair
91 add %rax, %r11 C finish r12:r11 += pending B3modb product
92 mov -16(%rdi), %rax C high limb of the next pair
93 adc %rdx, %r12
94 mul %r10 C high limb * B1modb
95 add %rax, %r9
96 mov %r11, %rax
97 mov %rdx, %r8
98 adc $0, %r8
99 mul %rbx C r11 * B2modb
100 add %rax, %r9
101 mov %r12, %rax
102 adc %rdx, %r8
103 mul %rbp C r12 * B3modb, left pending in rdx:rax
104 sub $2, %rsi
105 jb L(ed1) C out of limbs; ed1 adds the pending product to r8:r9
106 mov -40(%rdi), %r11 C low limb of the next pair
107 add %rax, %r9 C finish r8:r9 += pending B3modb product
108 mov -32(%rdi), %rax C high limb of the next pair
109 adc %rdx, %r8
110 L(m0): mul %r10 C same step with the accumulators swapped: result goes to r12:r11
111 add %rax, %r11
112 mov %r9, %rax
113 mov %rdx, %r12
114 adc $0, %r12
115 mul %rbx C r9 * B2modb
116 add %rax, %r11
117 lea -32(%rdi), %rdi C ap -= 4
118 mov %r8, %rax
119 adc %rdx, %r12
120 mul %rbp C r8 * B3modb, left pending in rdx:rax
121 sub $2, %rsi
122 jae L(top)
124 L(ed0): mov %r11, %r9 C loop left from the m0 half: move r12:r11 into r8:r9
125 mov %r12, %r8
126 L(ed1): add %rax, %r9 C add the pending B3modb product
127 adc %rdx, %r8
128 L(ed2): mov 8(%r13), R32(%rdi) C cnt
129 mov %r8, %rax
130 mov %r9, %r8
131 mul %r10 C high limb * B1modb
132 add %rax, %r8
133 adc $0, %rdx C rdx:r8 = low limb + high limb * B1modb
C Final reduction of rdx:r8, entered with edi = cnt.
C NOTE(review): with cnt = 0 the first shr below shifts by 0 and r9 would
C keep all of r8; presumably callers guarantee cnt >= 1 -- confirm.
134 L(1): xor R32(%rcx), R32(%rcx)
135 mov %r8, %r9
136 sub R32(%rdi), R32(%rcx) C ecx = -cnt; the 64-bit shr uses the low 6 bits, i.e. 64-cnt
137 shr R8(%rcx), %r9
138 mov R32(%rdi), R32(%rcx)
139 sal R8(%rcx), %rdx
140 or %rdx, %r9
141 sal R8(%rcx), %r8 C r9:r8 = (rdx:r8) << cnt
142 mov %r9, %rax
143 mulq (%r13) C rdx:rax = r9 * bi
144 mov %rax, %rsi
145 inc %r9
146 add %r8, %rsi
147 adc %r9, %rdx C rdx:rsi = nh*bi + (nh+1):nl, where nh:nl is the shifted value
148 imul %r14, %rdx C low limb of rdx * divisor (rdx serves as the quotient estimate)
149 sub %rdx, %r8 C candidate remainder
150 lea (%r8,%r14), %rax
151 cmp %r8, %rsi
152 cmovc %rax, %r8 C if rsi < candidate (unsigned), add the divisor back
153 mov %r8, %rax
154 sub %r14, %rax
155 cmovc %r8, %rax C keep r8 when it is already below the divisor, else use r8 - divisor
156 mov R32(%rdi), R32(%rcx)
157 shr R8(%rcx), %rax C shift the remainder back right by cnt; rax is the return value
158 pop %rbx
159 pop %rbp
160 pop %r12
161 pop %r13
162 pop %r14
163 FUNC_EXIT()
164 ret C explicit return; L(one) below must only be reached through its je
165 L(one):
166 mov (%rdi), %r8 C n = 1: the value is just ap[0]
167 mov 8(%rcx), R32(%rdi) C cnt (rcx still holds cps on this path)
168 xor %rdx, %rdx C high limb = 0
169 jmp L(1)
170 EPILOGUE()
C mpn_mod_1s_2p_cps -- fill the five-limb constant table read by mpn_mod_1s_2p.
C
C Arguments, as used after FUNC_ENTRY(2):
C   rdi  cps  output table (five limbs are written)
C   rsi  b    divisor; must be nonzero, since bsr leaves its destination
C             undefined for a zero source
C Stores:
C   cps[0] = bi      value returned by mpn_invert_limb for b << cnt
C   cps[1] = cnt     63 - (index of the highest set bit of b)
C   cps[2] = B1modb, cps[3] = B2modb, cps[4] = B3modb (each shifted right by cnt)
C Registers held across the call: rbx = cps, rbp = cnt, r12 = b << cnt.
C NOTE(review): the two SHLD_SLOW variants below agree only for cnt != 0
C (shld by 0 leaves rsi = 1, the manual sequence ORs in all of bi);
C presumably callers never pass a b with its top bit set -- confirm.
172 ALIGN(16)
173 PROLOGUE(mpn_mod_1s_2p_cps)
174 FUNC_ENTRY(2)
175 push %rbp
176 bsr %rsi, %rcx C index of the highest set bit of b
177 push %rbx
178 mov %rdi, %rbx C rbx = cps
179 push %r12
180 xor $63, R32(%rcx) C cnt = 63 - bit index
181 mov %rsi, %r12
182 mov R32(%rcx), R32(%rbp) C preserve cnt over call
183 sal R8(%rcx), %r12 C b << cnt
184 IFSTD(` mov %r12, %rdi ') C pass parameter
185 IFDOS(` mov %r12, %rcx ') C pass parameter
186 ASSERT(nz, `test $15, %rsp') C rsp alignment check; exact behaviour depends on the ASSERT macro defined elsewhere
187 CALL( mpn_invert_limb)
188 mov %r12, %r8
189 mov %rax, %r11 C r11 = bi, kept for the later multiplications
190 mov %rax, (%rbx) C store bi
191 mov %rbp, 8(%rbx) C store cnt
192 neg %r8 C r8 = -(b << cnt)
193 mov R32(%rbp), R32(%rcx)
194 mov $1, R32(%rsi)
195 ifdef(`SHLD_SLOW',`
196 shl R8(%rcx), %rsi C rsi = 1 << cnt
197 neg R32(%rcx) C shift count becomes 64-cnt
198 mov %rax, %rbp C save bi (cnt in rbp is no longer needed, it is in ecx and in cps[1])
199 shr R8(%rcx), %rax
200 or %rax, %rsi C rsi = (1 << cnt) | (bi >> (64-cnt))
201 mov %rbp, %rax C restore bi
202 neg R32(%rcx) C shift count back to cnt
203 ',`
204 shld R8(%rcx), %rax, %rsi C FIXME: Slow on Atom and Nano
205 ')
206 imul %r8, %rsi C rsi = -(b << cnt) * ((1 << cnt) | (bi >> (64-cnt)))
207 mul %rsi C rdx:rax = bi * rsi (rax still holds bi)
209 add %rsi, %rdx
210 shr R8(%rcx), %rsi
211 mov %rsi, 16(%rbx) C store B1modb
213 not %rdx
214 imul %r12, %rdx
215 lea (%rdx,%r12), %rsi
216 cmp %rdx, %rax
217 cmovnc %rdx, %rsi C rsi = rdx, or rdx + (b << cnt) when rax < rdx (unsigned)
218 mov %r11, %rax
219 mul %rsi C rdx:rax = bi * rsi
221 add %rsi, %rdx
222 shr R8(%rcx), %rsi
223 mov %rsi, 24(%rbx) C store B2modb
225 not %rdx
226 imul %r12, %rdx
227 add %rdx, %r12
228 cmp %rdx, %rax
229 cmovnc %rdx, %r12 C same selection as above, result in r12
231 shr R8(%rcx), %r12
232 mov %r12, 32(%rbx) C store B3modb
234 pop %r12
235 pop %rbx
236 pop %rbp
237 FUNC_EXIT()
238 ret C explicit return to the caller
239 EPILOGUE()