sync with experimental
[luatex.git] / source / libs / gmp / gmp-6.1.0 / mpn / x86_64 / coreibwl / mul_basecase.asm
blob6365d10a18569ce5f4435584dee79c8b0b5e7723
1 dnl AMD64 mpn_mul_basecase optimised for Intel Broadwell.
3 dnl Copyright 2015 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C cycles/limb	mul_1		addmul_1
C AMD K8,K9	n/a		n/a
C AMD K10	n/a		n/a
C AMD bull	n/a		n/a
C AMD pile	n/a		n/a
C AMD steam	n/a		n/a
C AMD excavator	 ?		 ?
C AMD bobcat	n/a		n/a
C AMD jaguar	n/a		n/a
C Intel P4	n/a		n/a
C Intel core2	n/a		n/a
C Intel NHM	n/a		n/a
C Intel SBR	n/a		n/a
C Intel IBR	n/a		n/a
C Intel HWL	 1.68		n/a
C Intel BWL	 1.69		 1.8-1.9
C Intel atom	n/a		n/a
C Intel SLM	n/a		n/a
C VIA nano	n/a		n/a
C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Do overlapped software pipelining.
C  * When changing this, make sure the code which falls into the inner loops
C    does not execute too many no-ops (for both PIC and non-PIC).
C Register aliases used throughout this file.
C
C Incoming parameters, in the order they are declared here.
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp_param',`%rcx')
define(`vn',      `%r8')

C Working registers.  Note the deliberate sharing: n lives in %rcx (same as
C vp_param) and v0 lives in %rdx (same as un_param), so both parameters have
C to be copied elsewhere before n or v0 is first written.  %rdx is also the
C implicit multiplier operand of mulx, and %rcx is what jrcxz tests.
define(`n',       `%rcx')
define(`n_save',  `%rbp')
define(`vp',      `%r14')
define(`unneg',   `%rbx')
define(`v0',      `%rdx')
define(`jaddr',   `%rax')

C Rotating product limbs: mulx results alternate between the pairs
C (w0, w1) and (w2, w3) as (low, high).
define(`w0',	`%r12')
define(`w1',	`%r9')
define(`w2',	`%r10')
define(`w3',	`%r11')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ----------------------------------------------------------------------------
C mpn_mul_basecase (rp, up, un, vp, vn)
C
C Schoolbook multiplication: stores the un+vn limb product of {up,un} and
C {vp,vn} at rp.  The code assumes un >= vn >= 1; nothing here checks it.
C
C Structure:
C   * un <= 2 is handled by straight-line code (L(s11), L(s21), L(s22)).
C   * Otherwise one mul_1 pass (L(m1top) loop) writes up[] * vp[0] to rp[],
C     followed by vn-1 addmul_1 passes (L(am1top) loop), one per remaining
C     limb of vp[].  Both loops are unrolled 8 ways and are entered through
C     jump tables indexed by un mod 8.
C
C Uses mulx, adcx and adox, so BMI2 and ADX are required.
C Callee-saved registers used in the general path (rbx, rbp, r12, r14) are
C pushed at L(gen) and popped at L(done).  rax, rcx, rdx and r8-r11 are
C clobbered.  Every return is preceded by FUNC_EXIT() to undo FUNC_ENTRY(4)
C in DOS64 builds.
C ----------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')

C Flags from this cmp are consumed twice: by ja right away and by je after
C the mov/mulx pair, neither of which writes flags.
	cmp	$2, un_param
	ja	L(gen)
	mov	(vp_param), %rdx
	mulx(	(up), %rax, %r9)	C 0 1
	je	L(s2x)

C un < 2 (so 1 x 1 given the assumed operand constraints): two limb result.
L(s11):	mov	%rax, (rp)
	mov	%r9, 8(rp)
	FUNC_EXIT()
	ret

C un = 2: compute up[1] * vp[0], then split on vn.
L(s2x):	cmp	$2, vn
	mulx(	8,(up), %r8, %r10)	C 1 2
	je	L(s22)

C 2 x 1: three limb result.
L(s21):	add	%r8, %r9
	adc	$0, %r10
	mov	%rax, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	FUNC_EXIT()
	ret

C 2 x 2: four limb result.  The trailing C n comments give the result limb
C index each instruction contributes to.
L(s22):	add	%r8, %r9		C 1
	adc	$0, %r10		C 2
	mov	8(vp_param), %rdx
	mov	%rax, (rp)
	mulx(	(up), %r8, %r11)	C 1 2
	mulx(	8,(up), %rax, %rdx)	C 2 3
	add	%r11, %rax		C 2
	adc	$0, %rdx		C 3
	add	%r8, %r9		C 1
	adc	%rax, %r10		C 2
	adc	$0, %rdx		C 3
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%rdx, 24(rp)
	FUNC_EXIT()
	ret

C General case, un > 2.
C   vp     = copy of vp_param (rcx is about to be reused as n)
C   unneg  = -((un + 1) & -8), limb distance used to rewind up and rp
C   n_save = un >> 3, loop count reloaded into n for every pass
C   rax    = un & 7, index into both jump tables
	ALIGN(16)
L(gen):
	push	%rbx
	push	%rbp
	push	%r12
	push	%r14

	mov	vp_param, vp
	lea	1(un_param), unneg
	mov	un_param, n_save
	mov	R32(un_param), R32(%rax)
	and	$-8, unneg
	shr	$3, n_save		C loop count
	neg	unneg
	and	$7, R32(%rax)		C clear CF for adc as side-effect
					C note that rax lives very long
	mov	n_save, n
	mov	(vp), v0
	lea	8(vp), vp

C Dispatch into the mul_1 feed-in code selected by un mod 8.
	lea	L(mtab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %r11
	lea	(%r11, %r10), %r10
	jmp	*%r10
',`
	jmp	*(%r10,%rax,8)
')

C mul_1 feed-in stubs, one per value of un mod 8.  Each does the first mulx
C and biases up, rp and (for mf3..mf7) n so that the jump lands at the
C matching entry point of the unrolled loop.  Only lea, inc and jmp are used
C here, none of which write CF, so the cleared CF reaches the first adc.
L(mf0):	mulx(	(up), w2, w3)
	lea	56(up), up
	lea	-8(rp), rp
	jmp	L(mb0)

L(mf3):	mulx(	(up), w0, w1)
	lea	16(up), up
	lea	16(rp), rp
	inc	n
	jmp	L(mb3)

L(mf4):	mulx(	(up), w2, w3)
	lea	24(up), up
	lea	24(rp), rp
	inc	n
	jmp	L(mb4)

L(mf5):	mulx(	(up), w0, w1)
	lea	32(up), up
	lea	32(rp), rp
	inc	n
	jmp	L(mb5)

L(mf6):	mulx(	(up), w2, w3)
	lea	40(up), up
	lea	40(rp), rp
	inc	n
	jmp	L(mb6)

L(mf7):	mulx(	(up), w0, w1)
	lea	48(up), up
	lea	48(rp), rp
	inc	n
	jmp	L(mb7)

L(mf1):	mulx(	(up), w0, w1)
	jmp	L(mb1)

C mf2 falls through into the top of the loop.
L(mf2):	mulx(	(up), w2, w3)
	lea	8(up), up
	lea	8(rp), rp
	mulx(	(up), w0, w1)

C mul_1 loop, 8 limbs per iteration.  The high limb of each product is
C added to the low limb of the next through a single adc chain; the loop
C counter is updated with dec, which leaves CF alone.
	ALIGN(16)
L(m1top):
	mov	w2, -8(rp)
	adc	w3, w0
L(mb1):	mulx(	8,(up), w2, w3)
	adc	w1, w2
	lea	64(up), up
	mov	w0, (rp)
L(mb0):	mov	w2, 8(rp)
	mulx(	-48,(up), w0, w1)
	lea	64(rp), rp
	adc	w3, w0
L(mb7):	mulx(	-40,(up), w2, w3)
	mov	w0, -48(rp)
	adc	w1, w2
L(mb6):	mov	w2, -40(rp)
	mulx(	-32,(up), w0, w1)
	adc	w3, w0
L(mb5):	mulx(	-24,(up), w2, w3)
	mov	w0, -32(rp)
	adc	w1, w2
L(mb4):	mulx(	-16,(up), w0, w1)
	mov	w2, -24(rp)
	adc	w3, w0
L(mb3):	mulx(	-8,(up), w2, w3)
	adc	w1, w2
	mov	w0, -16(rp)
	dec	n
	mulx(	(up), w0, w1)
	jnz	L(m1top)

C Wind down the mul_1 pass: store the last limbs and fold the final carry.
L(m1end):
	mov	w2, -8(rp)
	adc	w3, w0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)

	dec	vn
	jz	L(done)

C More limbs of vp[] remain.  Resolve the addmul_1 entry point once; it is
C kept in jaddr (rax) for all remaining passes.
	lea	L(atab)(%rip), %r10
ifdef(`PIC',
`	movslq	(%r10,%rax,4), %rax
	lea	(%rax, %r10), jaddr
',`
	mov	(%r10,%rax,8), jaddr
')

C Start of one addmul_1 pass: rewind up by (un + 1) & -8 limbs, reload the
C loop count, fetch the next limb of vp[] into v0, and enter the loop.
L(outer):
	lea	(up,unneg,8), up
	mov	n_save, n
	mov	(vp), v0
	lea	8(vp), vp
	jmp	*jaddr

C addmul_1 feed-in stubs, one per value of un mod 8.  Each does the first
C mulx and rewinds rp (via unneg plus a fixed bias) for the loop entry point
C it jumps to.  Only flag-neutral instructions are used.
L(f0):	mulx(	8,(up), w2, w3)
	lea	8(rp,unneg,8), rp
	lea	-1(n), n
	jmp	L(b0)

L(f3):	mulx(	-16,(up), w0, w1)
	lea	-56(rp,unneg,8), rp
	jmp	L(b3)

L(f4):	mulx(	-24,(up), w2, w3)
	lea	-56(rp,unneg,8), rp
	jmp	L(b4)

L(f5):	mulx(	-32,(up), w0, w1)
	lea	-56(rp,unneg,8), rp
	jmp	L(b5)

L(f6):	mulx(	-40,(up), w2, w3)
	lea	-56(rp,unneg,8), rp
	jmp	L(b6)

L(f7):	mulx(	16,(up), w0, w1)
	lea	8(rp,unneg,8), rp
	jmp	L(b7)

L(f1):	mulx(	(up), w0, w1)
	lea	8(rp,unneg,8), rp
	jmp	L(b1)

C Wind down one addmul_1 pass: finish the OF chain, then fold both pending
C carries (OF via adox, CF via adc) into the top limb.
L(am1end):
	adox(	(rp), w0)
	adox(	%rcx, w1)		C relies on rcx = 0
	mov	w0, (rp)
	adc	%rcx, w1		C relies on rcx = 0
	mov	w1, 8(rp)

C dec clears OF here but does not write CF; CF is expected to be clear
C already after the adc above.  Both chains therefore start the next pass
C with no pending carry.
	dec	vn
	jnz	L(outer)
L(done):
	pop	%r14
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

C f2 falls through into the top of the loop.
L(f2):
	mulx(	-8,(up), w2, w3)
	lea	8(rp,unneg,8), rp
	mulx(	(up), w0, w1)

C addmul_1 loop, 8 limbs per iteration, using two independent carry chains:
C adcx (CF) joins the high limb of each product to the low limb of the next,
C adox (OF) adds the existing rp[] limbs.  The counter is decremented with
C lea and tested with jrcxz so that neither flag is disturbed.
	ALIGN(16)
L(am1top):
	adox(	-8,(rp), w2)
	adcx(	w3, w0)
	mov	w2, -8(rp)
	jrcxz	L(am1end)
L(b1):	mulx(	8,(up), w2, w3)
	adox(	(rp), w0)
	lea	-1(n), n
	mov	w0, (rp)
	adcx(	w1, w2)
L(b0):	mulx(	16,(up), w0, w1)
	adcx(	w3, w0)
	adox(	8,(rp), w2)
	mov	w2, 8(rp)
L(b7):	mulx(	24,(up), w2, w3)
	lea	64(up), up
	adcx(	w1, w2)
	adox(	16,(rp), w0)
	mov	w0, 16(rp)
L(b6):	mulx(	-32,(up), w0, w1)
	adox(	24,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 24(rp)
L(b5):	mulx(	-24,(up), w2, w3)
	adcx(	w1, w2)
	adox(	32,(rp), w0)
	mov	w0, 32(rp)
L(b4):	mulx(	-16,(up), w0, w1)
	adox(	40,(rp), w2)
	adcx(	w3, w0)
	mov	w2, 40(rp)
L(b3):	adox(	48,(rp), w0)
	mulx(	-8,(up), w2, w3)
	mov	w0, 48(rp)
	lea	64(rp), rp
	adcx(	w1, w2)
	mulx(	(up), w0, w1)
	jmp	L(am1top)

C Jump tables, indexed by un mod 8: L(mtab) for the mul_1 pass, L(atab) for
C the addmul_1 passes.
	JUMPTABSECT
	ALIGN(8)
L(mtab):JMPENT(	L(mf0), L(mtab))
	JMPENT(	L(mf1), L(mtab))
	JMPENT(	L(mf2), L(mtab))
	JMPENT(	L(mf3), L(mtab))
	JMPENT(	L(mf4), L(mtab))
	JMPENT(	L(mf5), L(mtab))
	JMPENT(	L(mf6), L(mtab))
	JMPENT(	L(mf7), L(mtab))
L(atab):JMPENT(	L(f0), L(atab))
	JMPENT(	L(f1), L(atab))
	JMPENT(	L(f2), L(atab))
	JMPENT(	L(f3), L(atab))
	JMPENT(	L(f4), L(atab))
	JMPENT(	L(f5), L(atab))
	JMPENT(	L(f6), L(atab))
	JMPENT(	L(f7), L(atab))
	TEXT
EPILOGUE()