beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / bd1 / mul_2.asm
blob4ed5f3056145f4714fa5f8ad89ad9f58a219a7fb
1 dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9
37 C AMD K10
38 C AMD bull 4.36 average, quite fluctuating
39 C AMD pile 4.38 slightly fluctuating
40 C AMD steam
41 C AMD bobcat
42 C AMD jaguar
43 C Intel P4
44 C Intel core
45 C Intel NHM
46 C Intel SBR
47 C Intel IBR
48 C Intel HWL
49 C Intel BWL
50 C Intel atom
51 C VIA nano
53 C The loop of this code is the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
55 C Scheme: genxmul --mul
C Argument registers.  The register named in each trailing comment is
C presumably where the DOS64 ABI passes the same argument before FUNC_ENTRY
C runs -- confirm against the ABI macros.
define(`rp',      `%rdi')	C rcx
define(`up',      `%rsi')	C rdx
define(`n_param', `%rdx')	C r8
define(`vp',      `%rcx')	C r9

C The two limbs loaded from vp.
define(`v0', `%r8')
define(`v1', `%r9')

C Rotating accumulators.  w1 shares %rcx with vp, so vp must be fully read
C before w1 is first written.  w0 and w2 live in %rbx and %rbp.
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')

C Negated limb counter used as the index into up and rp.
define(`n',  `%r11')
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C -----------------------------------------------------------------------------
C mpn_mul_2 (rp, up, n_param, vp)
C
C Multiplies the n_param-limb operand at up by the two limbs v0 = vp[0] and
C v1 = vp[1].  Limbs are 8 bytes.  The low n_param+1 limbs of the product are
C stored at rp and the most significant limb is returned in %rax.
C
C In:      rp, up, n_param, vp in the registers named by the defines above
C Out:     %rax = most significant product limb
C Saved:   %rbx and %rbp are pushed on entry and popped before returning
C Scratch: %rcx, %rdx, %r8, %r9, %r10, %r11 and the flags are overwritten
C
C Every limb of up is multiplied by v0 and by v1; the partial products are
C summed into the rotating accumulators w0..w3, and each accumulator is stored
C to rp once nothing more can be added to it.
C
C NOTE(review): n_param = 1 looks unsupported.  It takes the b01 entry with n
C becoming 0, after which the loop body reads (up,n,8), one limb past the
C operand, and the loop-ending carry never occurs on the first pass.  Confirm
C that callers guarantee n_param >= 2.
C -----------------------------------------------------------------------------

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mul_2)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved, used as w0
	push	%rbp			C callee-saved, used as w2

	mov	(up), %rax		C u[0], multiplied by v0 below

	mov	(vp), v0
	mov	8(vp), v1		C last read of vp, %rcx is now free for w1

	lea	(up,n_param,8), up	C point up and rp just past limb
	lea	(rp,n_param,8), rp	C n_param-1, to be indexed with negative n

	mov	n_param, n
	mul	v0			C %rdx:%rax = u[0] * v0
	neg	n			C n = -n_param, so (up,n,8) is u[0]

C Pick the loop entry from the low two bits of the negated count.  The labels
C are named after n_param mod 4: b00 -> 0, b01 -> 1, b10 -> 2, b11 -> 3.
C Each entry parks the first product in the accumulator pair the chosen loop
C position expects, clears the next accumulator, reloads u[0] for the v1
C multiply and biases n so the loop offsets line up.
	test	$1, R8(n)
	jnz	L(bx1)

L(bx0):	test	$2, R8(n)
	jnz	L(b10)

L(b00):	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	(up,n,8), %rax		C u[0] again, lo0 multiplies it by v1
	jmp	L(lo0)

L(b10):	mov	%rax, w2
	mov	%rdx, w3
	mov	(up,n,8), %rax
	xor	R32(w0), R32(w0)
	mul	v1			C u[0] * v1, added at lo2
	add	$-2, n
	jmp	L(lo2)

L(bx1):	test	$2, R8(n)
	jz	L(b11)

L(b01):	mov	%rax, w3
	mov	%rdx, w0
	mov	(up,n,8), %rax
	mul	v1			C u[0] * v1, added at lo1
	xor	R32(w1), R32(w1)
	inc	n
	jmp	L(lo1)

L(b11):	mov	%rax, w1
	mov	%rdx, w2
	mov	(up,n,8), %rax		C u[0] again, lo3 multiplies it by v1
	xor	R32(w3), R32(w3)
	dec	n
	jmp	L(lo3)

C Main loop, unrolled four ways.  Each pass handles four limbs of up and
C stores four limbs to rp.  n rises by 4 per pass; the carry out of that add,
C produced when n reaches zero, ends the loop.  Where an accumulator is
C cleared while a carry is still pending, mov $0 is used instead of xor
C because mov leaves the flags untouched.
	ALIGN(32)
L(top):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
L(lo1):	add	%rax, w0		C entry: %rdx:%rax holds a pending u * v1
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	(up,n,8), %rax
	mul	v0
	mov	$0, R32(w2)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
	mov	(up,n,8), %rax
L(lo0):	mul	v1			C entry: %rax holds the limb to multiply by v1
	add	%rax, w1
	adc	%rdx, w2
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	w0, (rp,n,8)
	mov	$0, R32(w3)		C carry from the add above is still live
	mov	8(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(lo3):	mul	v1			C entry: %rax holds the limb to multiply by v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w0)		C carry from the add above is still live
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	mov	w1, 8(rp,n,8)
L(lo2):	add	%rax, w3		C entry: %rdx:%rax holds a pending u * v1
	adc	%rdx, w0
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	mov	$0, R32(w1)		C carry from the adc above is still live
	adc	$0, R32(w1)
	add	$4, n
	jnc	L(top)

C Wind-down: multiply the last limb of up by v1, store the three remaining
C accumulators and hand back the top limb.
L(end):	mov	-8(up,n,8), %rax
	mul	v1
	mov	w2, -16(rp,n,8)
	add	%rax, w0
	mov	w3, -8(rp,n,8)
	adc	%rdx, w1
	mov	w0, (rp,n,8)
	mov	w1, %rax		C return value: most significant limb

	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret				C return to the caller with the top limb in %rax
EPILOGUE()