beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreihwl / addmul_2.asm
blob54aebc888d75d300d25cbf862a496b10ff4698a2
1 dnl AMD64 mpn_addmul_2 optimised for Intel Haswell.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 n/a
37 C AMD K10 n/a
38 C AMD bull n/a
39 C AMD pile n/a
40 C AMD steam ?
41 C AMD bobcat n/a
42 C AMD jaguar ?
43 C Intel P4 n/a
44 C Intel core n/a
45 C Intel NHM n/a
46 C Intel SBR n/a
47 C Intel IBR n/a
48 C Intel HWL 2.15
49 C Intel BWL ?
50 C Intel atom n/a
51 C VIA nano n/a
53 C The loop of this code is the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
C Register allocation for mpn_addmul_2.
C
C Incoming arguments, in the first four System V AMD64 integer argument
C registers.
define(`rp',      `%rdi')	C accumulator / destination limb pointer
define(`up',      `%rsi')	C multiplicand limb pointer
define(`n_param', `%rdx')	C limb count as passed in; copied to n early
				C because %rdx is reloaded before every mulx
define(`vp',      `%rcx')	C pointer to the two multiplier limbs

C The two multiplier limbs, loaded from (vp) and 8(vp) on entry.
define(`v0', `%r8')
define(`v1', `%r9')

C High halves of the mulx products, carried into the next limb position.
C w1 shares %rcx with vp, so vp has to be consumed before w1 is first written.
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')

C Loop counter (derived from n_param).
define(`n',  `%r11')

C The two rp limbs currently being accumulated.
define(`X0', `%r12')
define(`X1', `%r13')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ---------------------------------------------------------------------------
C mpn_addmul_2
C
C Multiply the n_param-limb operand at up by the two limbs at vp and add the
C product to the n_param limbs at rp.  The low n_param+1 result limbs are
C stored at rp (the limb at index n_param is written, not accumulated) and
C the most significant limb is returned in %rax.
C
C In:     rp, up, n_param, vp in the registers named by the defines above.
C         FUNC_ENTRY(4) is presumably what maps the DOS64 argument registers
C         onto these -- confirm against the ABI macro definitions.
C Out:    %rax = most significant limb of the result.
C Writes: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags.  %rbx, %rbp,
C         %r12 and %r13 are pushed on entry and popped before returning.
C
C The loop is unrolled four limbs deep (top -> lo1 -> lo0 -> lo3 -> lo2).
C The low two bits of n_param pick the entry point:
C     n_param mod 4 = 0  ->  L(b00)  ->  L(lo0)
C     n_param mod 4 = 1  ->  L(b01)  ->  L(lo1)
C     n_param mod 4 = 2  ->  L(b10)  ->  L(lo2)
C     n_param mod 4 = 3  ->  L(b11)  ->  L(lo3)
C
C n_param must be at least 2.  With n_param = 1 the counter reaches L(lo2) as
C zero and is decremented past it, so the loop would not terminate properly.
C
C mulx(src, lo, hi) multiplies src by %rdx and leaves the flags alone, which
C is why add/adc pairs may straddle a mulx throughout this code.
C ---------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_addmul_2)
	FUNC_ENTRY(4)
	push	%rbx			C w0, callee-saved
	push	%rbp			C w2, callee-saved
	push	%r12			C X0, callee-saved
	push	%r13			C X1, callee-saved

	mov	(vp), v0		C fetch both multiplier limbs now: w1
	mov	8(vp), v1		C reuses vp's register later on

	mov	n_param, n		C %rdx is needed as the mulx operand
	shr	$2, n			C n = number of full 4-limb rounds

	test	$1, R8(n_param)
	jnz	L(bx1)

C n_param even: X0 pairs with rp[0], X1 with rp[1].
L(bx0):	mov	(rp), X0
	mov	8(rp), X1
	test	$2, R8(n_param)
	jnz	L(b10)

C n_param mod 4 = 0
L(b00):	mov	(up), %rdx
	lea	16(up), up
	mulx(	v0, %rax, w1)
	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	mov	X0, (rp)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up), %rdx
	lea	16(rp), rp
	jmp	L(lo0)

C n_param mod 4 = 2
L(b10):	mov	(up), %rdx
	inc	n			C one extra pass through the dec at L(lo2)
	mulx(	v0, %rax, w1)
	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	mov	X0, (rp)
	mov	16(rp), X0
	add	%rax, X1
	adc	$0, w2
	xor	w0, w0			C no high limb pending from a previous v1 product
	jmp	L(lo2)

C n_param odd: the roles of X0 and X1 are swapped relative to L(bx0).
L(bx1):	mov	(rp), X1
	mov	8(rp), X0
	test	$2, R8(n_param)
	jnz	L(b11)

C n_param mod 4 = 1
L(b01):	mov	(up), %rdx
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	8(up), %rdx
	mov	X1, (rp)
	mov	16(rp), X1
	mulx(	v0, %rax, w1)
	lea	24(rp), rp
	lea	24(up), up
	jmp	L(lo1)

C n_param mod 4 = 3
L(b11):	mov	(up), %rdx
	inc	n			C one extra pass through the dec at L(lo2)
	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	mov	X1, (rp)
	mov	8(up), %rdx
	mulx(	v0, %rax, w1)
	lea	8(rp), rp
	lea	8(up), up
	jmp	L(lo3)

C Main loop.  Each of the four stages multiplies one up limb by v0 and v1,
C adds the low halves and the pending high halves (w0..w3) into X0/X1, and
C stores one finished limb to rp.
	ALIGN(16)
L(top):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	lea	32(rp), rp
	add	w1, X1
	mov	-16(up), %rdx
	mov	X1, -24(rp)
	adc	$0, w3
	add	w2, X0
	mov	-8(rp), X1
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo1):	add	%rax, X0
	mulx(	v1, %rax, w2)
	adc	$0, w1
	add	w3, X0
	mov	X0, -16(rp)
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
	add	w0, X1
	mov	-8(up), %rdx
	adc	$0, w2
L(lo0):	mulx(	v0, %rax, w3)
	add	%rax, X1
	adc	$0, w3
	mov	(rp), X0
	mulx(	v1, %rax, w0)
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	mov	X1, -8(rp)
	adc	$0, w3
	mov	(up), %rdx
	add	w2, X0
	mulx(	v0, %rax, w1)
	adc	$0, w0
L(lo3):	add	%rax, X0
	adc	$0, w1
	mulx(	v1, %rax, w2)
	add	w3, X0
	mov	8(rp), X1
	mov	X0, (rp)
	mov	16(rp), X0
	adc	$0, w1
	add	%rax, X1
	adc	$0, w2
L(lo2):	mov	8(up), %rdx		C next up limb; also the one used at L(end)
	lea	32(up), up
	dec	n
	jnz	L(top)

C Wind-down: %rdx holds the last up limb.  Finish the pending limb, then form
C the top stored limb in %rdx and the returned limb in %rax.
L(end):	mulx(	v0, %rax, w3)
	add	w0, X1
	adc	$0, w2
	add	%rax, X1
	adc	$0, w3
	mulx(	v1, %rdx, %rax)		C low half -> %rdx, high half -> %rax
	add	w1, X1
	mov	X1, 8(rp)
	adc	$0, w3
	add	w2, %rdx
	adc	$0, %rax
	add	w3, %rdx
	mov	%rdx, 16(rp)
	adc	$0, %rax		C %rax = most significant limb (return value)

	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret				C explicit return: nothing visible here emits
					C one, and EPILOGUE() follows immediately
EPILOGUE()