beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreihwl / mul_1.asm
blob1e3c338f4ee3b6d1c67a573ee8853f137589b771
1 dnl AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2012, 2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb best
36 C AMD K8,K9 n/a
37 C AMD K10 n/a
38 C AMD bd1 n/a
39 C AMD bd2 ?
40 C AMD bobcat n/a
41 C AMD jaguar ?
42 C Intel P4 n/a
43 C Intel PNR n/a
44 C Intel NHM n/a
45 C Intel SBR n/a
46 C Intel IBR n/a
47 C Intel HWL 1.57 this
48 C Intel BWL ?
49 C Intel atom n/a
50 C VIA nano n/a
52 C The loop of this code is the result of running a code generation and
53 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
C Incoming arguments.  The register named in the trailing comment is the one
C recorded beside each definition in the original file (presumably the DOS64
C argument register before FUNC_ENTRY remaps it -- confirm in the ABI glue).
define(`rp',       `%rdi')	C rcx
define(`up',       `%rsi')	C rdx
define(`n_param',  `%rdx')	C r8
define(`v0_param', `%rcx')	C r9

C Working registers.  v0 names the same register as n_param, so every use of
C n_param has to come before v0 is loaded.
define(`n',        `%rbp')
define(`v0',       `%rdx')

C Calling conventions this file is declared to support.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
	TEXT
	ALIGN(32)
C ----------------------------------------------------------------------------
C mpn_mul_1 -- multiply the limb vector at up by the single limb v0.
C
C In:    rp        destination pointer; one limb is stored per source limb
C        up        source pointer
C        n_param   limb count.  A zero count is not handled: the b00 feed-in
C                  always runs at least one pass of the loop.
C        v0_param  multiplier limb
C Out:   %rax      most significant limb of the product (the carry-out)
C Uses:  %rbx, %rbp, %r12 (saved and restored here), %rcx, %rdx, %r8-%r11
C
C The loop is unrolled four ways.  The two low bits of the count pick one of
C four feed-in sequences (b00, b01, b10, b11).  Each starts the first products
C and then either jumps into the loop at lo0, lo1, lo2 or lo3, or, when fewer
C than four limbs are present, goes straight to the tail at cj1, cj2 or cj3.
C rp is biased downwards in every feed-in so that the fixed store offsets used
C by the loop and the tail land on the intended limbs.
C
C Each mulx yields a (low, high) register pair: (%r9,%r8), (%r11,%r10),
C (%rcx,%r12), (%rbx,%rax).  The high half of one product is added to the low
C half of the next with adc.  The carry flag stays live right round the loop:
C only lea, mov, mulx and dec sit between the adc instructions, and none of
C them writes CF.  The test instructions in the feed-ins leave CF clear, so
C the first adc reached on any path adds no stray carry.
C ----------------------------------------------------------------------------
PROLOGUE(mpn_mul_1)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp
	push	%r12

	mov	n_param, n
	shr	$2, n			C n = number of whole 4-limb groups

	test	$1, R8(n_param)		C dispatch on count mod 4, bit 0 first
	jnz	L(bx1)

C Even counts.  n_param is tested before v0 is loaded because both live in
C the same register.
L(bx0):	test	$2, R8(n_param)
	mov	v0_param, v0
	jnz	L(b10)

C count mod 4 = 0: start three products, the loop supplies the fourth.
L(b00):	mulx(	(up), %r9, %r8)
	mulx(	8,(up), %r11, %r10)
	mulx(	16,(up), %rcx, %r12)
	lea	-32(rp), rp
	jmp	L(lo0)

C count mod 4 = 2: two products; finish at cj2 if there is no full group.
L(b10):	mulx(	(up), %rcx, %r12)
	mulx(	8,(up), %rbx, %rax)
	lea	-16(rp), rp
	test	n, n
	jz	L(cj2)
	mulx(	16,(up), %r9, %r8)
	lea	16(up), up
	jmp	L(lo2)

C Odd counts.
L(bx1):	test	$2, R8(n_param)
	mov	v0_param, v0
	jnz	L(b11)

C count mod 4 = 1: one product; finish at cj1 if there is no full group.
L(b01):	mulx(	(up), %rbx, %rax)
	lea	-24(rp), rp
	test	n, n
	jz	L(cj1)
	mulx(	8,(up), %r9, %r8)
	lea	8(up), up
	jmp	L(lo1)

C count mod 4 = 3: three products; finish at cj3 if there is no full group.
L(b11):	mulx(	(up), %r11, %r10)
	mulx(	8,(up), %rcx, %r12)
	mulx(	16,(up), %rbx, %rax)
	lea	-8(rp), rp
	test	n, n
	jz	L(cj3)
	lea	24(up), up
	jmp	L(lo3)

C Main loop: four limbs per pass.  Pointer updates use lea and the counter
C uses dec so that the carry from the preceding adc survives into the next.
	ALIGN(32)
L(top):	lea	32(rp), rp
	mov	%r9, (rp)
	adc	%r8, %r11
L(lo3):	mulx(	(up), %r9, %r8)
	mov	%r11, 8(rp)
	adc	%r10, %rcx
L(lo2):	mov	%rcx, 16(rp)
	adc	%r12, %rbx
L(lo1):	mulx(	8,(up), %r11, %r10)
	adc	%rax, %r9
	mulx(	16,(up), %rcx, %r12)
	mov	%rbx, 24(rp)
L(lo0):	mulx(	24,(up), %rbx, %rax)
	lea	32(up), up
	dec	n
	jnz	L(top)

C Tail: store the limbs still held in registers and fold the last carry into
C the returned high limb.
L(end):	lea	32(rp), rp
	mov	%r9, (rp)
	adc	%r8, %r11
L(cj3):	mov	%r11, 8(rp)
	adc	%r10, %rcx
L(cj2):	mov	%rcx, 16(rp)
	adc	%r12, %rbx
L(cj1):	mov	%rbx, 24(rp)
	adc	$0, %rax

	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
C FUNC_EXIT only undoes what FUNC_ENTRY set up (see the x86_64 m4 ABI glue);
C it does not return, so the ret has to be written out here.
	ret
EPILOGUE()
ASM_END()