dnl  (gitweb extraction header, not part of the GMP source proper:)
dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreihwl / mul_basecase.asm
dnl  blob b2656c8e9bde17dd228b6858d12b610f3de9ab85
1 dnl AMD64 mpn_mul_basecase optimised for Intel Haswell.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb mul_1 mul_2 mul_3 addmul_2
36 C AMD K8,K9 n/a n/a - n/a
37 C AMD K10 n/a n/a - n/a
38 C AMD bull n/a n/a - n/a
39 C AMD pile n/a n/a - n/a
40 C AMD steam ? ? - ?
41 C AMD bobcat n/a n/a - n/a
42 C AMD jaguar ? ? - ?
43 C Intel P4 n/a n/a - n/a
44 C Intel core n/a n/a - n/a
45 C Intel NHM n/a n/a - n/a
46 C Intel SBR n/a n/a - n/a
47 C Intel IBR n/a n/a - n/a
48 C Intel HWL 1.77 1.86 - 2.15
49 C Intel BWL ? ? - ?
50 C Intel atom n/a n/a - n/a
51 C VIA nano n/a n/a - n/a
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
56 C TODO
57 C * Adjoin a mul_3.
58 C * Further micro-optimise.
C Parameter registers (SysV AMD64 argument order; Win64 entry remaps via
C FUNC_ENTRY/IFDOS below) and scratch-register names.
60 define(`rp', `%rdi') C result (product) pointer -- arg 1
61 define(`up', `%rsi') C first source operand -- arg 2
62 define(`un_param',`%rdx') C size of {up,.} in limbs -- arg 3; moved out of rdx, which mulx needs
63 define(`vp', `%rcx') C second source operand -- arg 4
64 define(`vn', `%r8') C size of {vp,.} in limbs -- arg 5
66 define(`un', `%rbx') C callee-saved home for un (negated at entry)
68 define(`w0', `%r10') C w0..w3: product-limb scratch registers
69 define(`w1', `%r11')
70 define(`w2', `%r12')
71 define(`w3', `%r13')
72 define(`n', `%rbp') C inner-loop counter
73 define(`v0', `%r9') C current v limb for the mul/addmul passes
75 ABI_SUPPORT(DOS64) C assembled for both Win64 (DOS64) and SysV (STD64)
76 ABI_SUPPORT(STD64)
78 ASM_START()
79 TEXT
80 ALIGN(16)
C mpn_mul_basecase -- school-book multiply: {rp, un+vn} = {up,un} * {vp,vn}.
C Assumes un >= vn >= 1 (standard mpn_mul_basecase contract -- confirm
C against the GMP internals documentation).
C Plan: if vn is odd, first compute rp = up * vp[0] with a mul_1 pass;
C if vn is even, rp = up * {vp[0],vp[1]} with a mul_2 pass; then add in
C the remaining v limbs two at a time with an addmul_2 outer loop.
C NOTE(review): restored the `ret` after FUNC_EXIT() (original line 440,
C lost in extraction); without it execution would fall off the function end.
81 PROLOGUE(mpn_mul_basecase)
82 FUNC_ENTRY(4)
83 IFDOS(` mov 56(%rsp), %r8d ') C Win64: load 5th argument (vn) from the stack
84 push %rbx
85 push %rbp
86 push %r12
87 push %r13
88 push %r14
89 mov un_param, un C free up rdx
90 neg un C un is kept negated from here on
92 mov un_param, n C FIXME: share
93 sar $2, n C FIXME: share
95 test $1, R8(vn) C odd vn: start with a mul_1 pass
96 jz L(do_mul_2)
98 define(`w4', `%r9')
99 define(`w5', `%r14')
101 mov (vp), %rdx C v limb -> rdx, the implicit mulx operand
C ----- mul_1: rp[] = up[] * vp[0].  Dispatch on the low two bits of un
C ----- to enter the 4-way unrolled loop at the matching phase.
103 L(do_mul_1):
104 test $1, R8(un)
105 jnz L(m1x1)
107 L(m1x0):test $2, R8(un)
108 jnz L(m110)
110 L(m100):
111 mulx( (up), w5, w2)
112 mulx( 8,(up), w1, w3)
113 lea -24(rp), rp C bias rp so the loop's fixed offsets line up
114 jmp L(m1l0)
116 L(m110):
117 mulx( (up), w3, w4)
118 mulx( 8,(up), w1, w5)
119 lea -8(rp), rp
120 test n, n
121 jz L(cj2) C short operand: go straight to wind-down
122 mulx( 16,(up), w0, w2)
123 lea 16(up), up
124 jmp L(m1l2)
126 L(m1x1):test $2, R8(un)
127 jz L(m111)
129 L(m101):
130 mulx( (up), w4, w5)
131 lea -16(rp), rp
132 test n, n
133 jz L(cj1)
134 mulx( 8,(up), w0, w2)
135 lea 8(up), up
136 jmp L(m1l1)
138 L(m111):
139 mulx( (up), w2, w3)
140 mulx( 8,(up), w0, w4)
141 mulx( 16,(up), w1, w5)
142 lea 24(up), up
143 test n, n
144 jnz L(gt3)
145 add w0, w3
146 jmp L(cj3)
147 L(gt3): add w0, w3
148 jmp L(m1l3)
C 4-way unrolled mul_1 core.  mulx writes lo,hi without touching flags,
C so a single adc chain carries across the whole iteration.
150 ALIGN(32)
151 L(m1tp):lea 32(rp), rp
152 L(m1l3):mov w2, (rp)
153 mulx( (up), w0, w2)
154 L(m1l2):mov w3, 8(rp)
155 adc w1, w4
156 L(m1l1):adc w0, w5
157 mov w4, 16(rp)
158 mulx( 8,(up), w1, w3)
159 L(m1l0):mov w5, 24(rp)
160 mulx( 16,(up), w0, w4)
161 adc w1, w2
162 mulx( 24,(up), w1, w5)
163 adc w0, w3
164 lea 32(up), up
165 dec n
166 jnz L(m1tp)
C mul_1 wind-down: store the last limbs, fold the final carry into w5.
168 L(m1ed):lea 32(rp), rp
169 L(cj3): mov w2, (rp)
170 L(cj2): mov w3, 8(rp)
171 adc w1, w4
172 L(cj1): mov w4, 16(rp)
173 adc $0, w5
174 mov w5, 24(rp)
176 dec R32(vn) C one v limb consumed; done if that was the last
177 jz L(ret5)
179 lea 8(vp), vp
180 lea 32(rp), rp
181 C push %r12
182 C push %r13
183 C push %r14
184 jmp L(do_addmul)
C ----- mul_2: rp[] = up[] * {vp[0],vp[1]} (even vn enters here).
186 L(do_mul_2):
187 define(`v1', `%r14')
188 C push %r12
189 C push %r13
190 C push %r14
192 mov (vp), v0
193 mov 8(vp), v1
195 lea (un), n C n = un/4 (negative; inner loop counts up via inc)
196 sar $2, n
198 test $1, R8(un) C dispatch on the low two bits of un, as above
199 jnz L(m2x1)
201 L(m2x0):xor w0, w0
202 test $2, R8(un)
203 mov (up), %rdx
204 mulx( v0, w2, w1)
205 jz L(m2l0)
207 L(m210):lea -16(rp), rp
208 lea -16(up), up
209 jmp L(m2l2)
211 L(m2x1):xor w2, w2
212 test $2, R8(un)
213 mov (up), %rdx
214 mulx( v0, w0, w3)
215 jz L(m211)
217 L(m201):lea -24(rp), rp
218 lea 8(up), up
219 jmp L(m2l1)
221 L(m211):lea -8(rp), rp
222 lea -8(up), up
223 jmp L(m2l3)
C 4-way unrolled mul_2 core: each up limb is multiplied by both v0 and
C v1; rdx is reloaded with the next up limb before each mulx pair.
225 ALIGN(16)
226 L(m2tp):mulx( v1, %rax, w0)
227 add %rax, w2
228 mov (up), %rdx
229 mulx( v0, %rax, w1)
230 adc $0, w0
231 add %rax, w2
232 adc $0, w1
233 add w3, w2
234 L(m2l0):mov w2, (rp)
235 adc $0, w1
236 mulx( v1, %rax, w2)
237 add %rax, w0
238 mov 8(up), %rdx
239 adc $0, w2
240 mulx( v0, %rax, w3)
241 add %rax, w0
242 adc $0, w3
243 add w1, w0
244 L(m2l3):mov w0, 8(rp)
245 adc $0, w3
246 mulx( v1, %rax, w0)
247 add %rax, w2
248 mov 16(up), %rdx
249 mulx( v0, %rax, w1)
250 adc $0, w0
251 add %rax, w2
252 adc $0, w1
253 add w3, w2
254 L(m2l2):mov w2, 16(rp)
255 adc $0, w1
256 mulx( v1, %rax, w2)
257 add %rax, w0
258 mov 24(up), %rdx
259 adc $0, w2
260 mulx( v0, %rax, w3)
261 add %rax, w0
262 adc $0, w3
263 add w1, w0
264 lea 32(up), up
265 L(m2l1):mov w0, 24(rp)
266 adc $0, w3
267 inc n
268 lea 32(rp), rp
269 jnz L(m2tp)
C mul_2 wind-down: fold in the last v1 product, store the top two limbs.
271 L(m2ed):mulx( v1, %rdx, %rax)
272 add %rdx, w2
273 adc $0, %rax
274 add w3, w2
275 mov w2, (rp)
276 adc $0, %rax
277 mov %rax, 8(rp)
279 add $-2, R32(vn) C two v limbs consumed
280 jz L(ret5)
281 lea 16(vp), vp
282 lea 16(rp), rp
C ----- addmul_2 outer loop: rp[] += up[] * {vp[0],vp[1]}, two v limbs
C ----- per pass.  vn moves to a stack slot so r8 can be reused as v1.
285 L(do_addmul):
286 push %r15
287 push vn C save vn in new stack slot
288 define(`vn', `(%rsp)')
289 define(`X0', `%r14')
290 define(`X1', `%r15')
291 define(`v1', `%r8')
293 lea (rp,un,8), rp C un < 0: step rp and up back to the operand base
294 lea (up,un,8), up
296 L(outer):
297 mov (vp), v0
298 mov 8(vp), v1
300 lea 2(un), n C n = (un+2)/4 (negative; counted up by inc)
301 sar $2, n
303 mov (up), %rdx
304 test $1, R8(un) C dispatch on the low two bits of un to enter in phase
305 jnz L(bx1)
307 L(bx0): mov (rp), X0
308 mov 8(rp), X1
309 mulx( v0, %rax, w1)
310 add %rax, X0
311 mulx( v1, %rax, w2)
312 adc $0, w1
313 mov X0, (rp)
314 add %rax, X1
315 adc $0, w2
316 mov 8(up), %rdx
317 test $2, R8(un)
318 jnz L(b10)
320 L(b00): lea 16(up), up
321 lea 16(rp), rp
322 jmp L(lo0)
324 L(b10): mov 16(rp), X0
325 lea 32(up), up
326 mulx( v0, %rax, w3)
327 jmp L(lo2)
329 L(bx1): mov (rp), X1
330 mov 8(rp), X0
331 mulx( v0, %rax, w3)
332 add %rax, X1
333 adc $0, w3
334 mulx( v1, %rax, w0)
335 add %rax, X0
336 adc $0, w0
337 mov 8(up), %rdx
338 mov X1, (rp)
339 mulx( v0, %rax, w1)
340 test $2, R8(un)
341 jz L(b11)
343 L(b01): mov 16(rp), X1
344 lea 24(rp), rp
345 lea 24(up), up
346 jmp L(lo1)
348 L(b11): lea 8(rp), rp
349 lea 8(up), up
350 jmp L(lo3)
C 4-way unrolled addmul_2 core: X0/X1 hold the rp limbs being updated.
352 ALIGN(16)
353 L(top): mulx( v0, %rax, w3)
354 add w0, X1
355 adc $0, w2
356 L(lo2): add %rax, X1
357 adc $0, w3
358 mulx( v1, %rax, w0)
359 add %rax, X0
360 adc $0, w0
361 lea 32(rp), rp
362 add w1, X1
363 mov -16(up), %rdx
364 mov X1, -24(rp)
365 adc $0, w3
366 add w2, X0
367 mov -8(rp), X1
368 mulx( v0, %rax, w1)
369 adc $0, w0
370 L(lo1): add %rax, X0
371 mulx( v1, %rax, w2)
372 adc $0, w1
373 add w3, X0
374 mov X0, -16(rp)
375 adc $0, w1
376 add %rax, X1
377 adc $0, w2
378 add w0, X1
379 mov -8(up), %rdx
380 adc $0, w2
381 L(lo0): mulx( v0, %rax, w3)
382 add %rax, X1
383 adc $0, w3
384 mov (rp), X0
385 mulx( v1, %rax, w0)
386 add %rax, X0
387 adc $0, w0
388 add w1, X1
389 mov X1, -8(rp)
390 adc $0, w3
391 mov (up), %rdx
392 add w2, X0
393 mulx( v0, %rax, w1)
394 adc $0, w0
395 L(lo3): add %rax, X0
396 adc $0, w1
397 mulx( v1, %rax, w2)
398 add w3, X0
399 mov 8(rp), X1
400 mov X0, (rp)
401 mov 16(rp), X0
402 adc $0, w1
403 add %rax, X1
404 adc $0, w2
405 mov 8(up), %rdx
406 lea 32(up), up
407 inc n
408 jnz L(top)
C addmul_2 wind-down: combine remaining partials, store the top limbs.
410 L(end): mulx( v0, %rax, w3)
411 add w0, X1
412 adc $0, w2
413 add %rax, X1
414 adc $0, w3
415 mulx( v1, %rdx, %rax)
416 add w1, X1
417 mov X1, 8(rp)
418 adc $0, w3
419 add w2, %rdx
420 adc $0, %rax
421 add w3, %rdx
422 mov %rdx, 16(rp)
423 adc $0, %rax
424 mov %rax, 24(rp)
426 addl $-2, vn C vn is the stack slot here; two more v limbs done
427 lea 16(vp), vp
428 lea -16(up,un,8), up C rewind up/rp for the next pass (un < 0)
429 lea 32(rp,un,8), rp
430 jnz L(outer)
432 pop %rax C deallocate vn slot
433 pop %r15
C Shared returns: paths that pushed fewer registers enter lower down.
434 L(ret5):pop %r14
435 L(ret4):pop %r13
436 L(ret3):pop %r12
437 L(ret2):pop %rbp
438 pop %rbx
439 FUNC_EXIT()
440 ret
441 EPILOGUE()