beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / coreisbr / addmul_2.asm
blob21f0bf465f6423845f4aeb4fd9137f1acd98cb0a
1 dnl AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb best
36 C AMD K8,K9
37 C AMD K10
38 C AMD bull
39 C AMD pile
40 C AMD bobcat
41 C AMD jaguar
42 C Intel P4
43 C Intel core
44 C Intel NHM
45 C Intel SBR 2.93 this
46 C Intel IBR 2.66 this
47 C Intel HWL 2.5 2.15
48 C Intel BWL
49 C Intel atom
50 C VIA nano
52 C This code is the result of running a code generation and optimisation tool
53 C suite written by David Harvey and Torbjorn Granlund.
55 C When playing with pointers, set this to $2 to fall back to conservative
56 C indexing in wind-down code.
57 define(`I',`$1') C I(a,b) expands to a, the first of two operand forms
60 define(`rp', `%rdi') C arg 1, result limbs (rcx: presumably the DOS64 register)
61 define(`up', `%rsi') C arg 2, source limbs (rdx: presumably the DOS64 register)
62 define(`n_param', `%rdx') C arg 3, limb count (r8: presumably the DOS64 register)
63 define(`vp', `%rcx') C arg 4, the two multiplier limbs (r9: presumably the DOS64 register)
65 define(`n', `%rcx') C negated limb index; shares %rcx with vp, so vp must be read first
66 define(`v0', `%rbx') C multiplier limb loaded from (vp)
67 define(`v1', `%rbp') C multiplier limb loaded from 8(vp)
68 define(`w0', `%r8') C w0-w3: high product halves and carries passed between steps
69 define(`w1', `%r9')
70 define(`w2', `%r10')
71 define(`w3', `%r11')
72 define(`X0', `%r12') C X0, X1: result limbs being accumulated before they are stored
73 define(`X1', `%r13')
75 ABI_SUPPORT(DOS64)
76 ABI_SUPPORT(STD64)
78 ASM_START()
79 TEXT
80 ALIGN(32) C NOTE(review): presumably aligns the function entry to 32 bytes
C mpn_addmul_2 -- multiply the n_param limbs at up by the two limbs at vp and
C add the product to the n_param limbs at rp.  Result limbs rp[0] through
C rp[n_param] are stored (rp[n_param] is overwritten, not accumulated) and
C the limb above them is returned in %rax.
C
C Register roles: v0,v1 hold the two vp limbs, X0,X1 are the result limbs
C being assembled, w0-w3 hold high product halves plus carries, and n is a
C negative limb index that counts up to zero.  up and rp are advanced past
C the operands so that (up,n,8) style operands walk forward as n rises.
C
C NOTE(review): the bare numbers in the existing C comments appear to tag
C the relative result limb each instruction contributes to; confirm.
C NOTE(review): tracing the b01 path with n_param = 1 enters the loop with
C n = 0, so the code appears to require n_param >= 2; confirm with callers.
81 PROLOGUE(mpn_addmul_2)
82 FUNC_ENTRY(4) C ABI entry glue for 4 arguments (macro defined outside this file)
83 push %rbx C save the callee-saved registers used as v0, v1, X0, X1
84 push %rbp
85 push %r12
86 push %r13
88 mov (vp), v0 C v0 = vp[0]
89 mov 8(vp), v1 C v1 = vp[1]; last use of vp before n takes over %rcx
91 mov (up), %rax C rax = up[0]
93 mov n_param, n
94 neg n C n = -n_param
96 lea (up,n_param,8), up C up = address of up[n_param]
97 lea 8(rp,n_param,8), rp C rp = address of rp[n_param+1]
98 mul v0 C rdx:rax = up[0] * v0
100 test $1, R8(n) C bit 0 of n (R8 presumably names the low byte of n)
101 jnz L(bx1) C odd limb count
103 L(bx0): mov -8(rp,n,8), X0 C even limb count: X0 = rp[0]
104 mov %rdx, w1 C w1 = high half of up[0] * v0
105 add %rax, X0 C X0 += low half
106 adc $0, w1 C fold the carry into the high half
107 mov (up,n,8), %rax C reload up[0] for the multiply by v1
108 xor w0, w0 C w0 and w3 start out empty on this path
109 xor w3, w3
110 test $2, R8(n) C bit 1 of n selects the loop entry point
111 jnz L(b10)
113 L(b00): nop C this nop makes the loop go faster on SBR!
114 mul v1 C rdx:rax = up[0] * v1
115 mov (rp,n,8), X1 C X1 = rp[1]
116 jmp L(lo0)
118 L(b10): lea -2(n), n C rebias n for the +16/+24 offsets used from lo2 on
119 jmp L(lo2)
121 L(bx1): mov -8(rp,n,8), X1 C odd limb count: X1 = rp[0]
122 mov %rdx, w3 C w3 = high half of up[0] * v0
123 add %rax, X1 C X1 += low half
124 adc $0, w3 C fold the carry into the high half
125 mov (up,n,8), %rax C reload up[0] for the multiply by v1
126 xor w1, w1 C w1 and w2 start out empty on this path
127 xor w2, w2
128 test $2, R8(n) C bit 1 of n selects the loop entry point
129 jz L(b11)
131 L(b01): mov (rp,n,8), X0 C X0 = rp[1]
132 inc n C rebias n for the offsets used from lo1 on
133 jmp L(lo1)
135 L(b11): dec n C rebias n for the +8/+16 offsets used from lo3 on
136 jmp L(lo3)
C Main loop, unrolled four source limbs per pass.  Entry labels lo0..lo3 are
C used when n_param mod 4 is 0..3 respectively.  Each source limb is loaded
C twice: once for the multiply by v0 and once for the multiply by v1.
138 ALIGN(32)
139 L(top):
140 L(lo1): mul v1
141 mov %rdx, w0 C 1
142 add %rax, X0 C 0
143 adc $0, w0 C 1
144 add w1, X1 C 3
145 adc $0, w3 C 0
146 add w2, X0 C 0
147 adc $0, w0 C 1
148 mov (up,n,8), %rax C next source limb, for v0
149 mul v0
150 add %rax, X0 C 0
151 mov %rdx, w1 C 1
152 adc $0, w1 C 1
153 mov (up,n,8), %rax C same source limb again, for v1
154 mul v1
155 mov X1, -16(rp,n,8) C 3
156 mov (rp,n,8), X1 C 1
157 add w3, X0 C 0
158 adc $0, w1 C 1
159 L(lo0): mov %rdx, w2 C 2
160 mov X0, -8(rp,n,8) C 0
161 add %rax, X1 C 1
162 adc $0, w2 C 2
163 mov 8(up,n,8), %rax C next source limb, for v0
164 add w0, X1 C 1
165 adc $0, w2 C 2
166 mul v0
167 add %rax, X1 C 1
168 mov %rdx, w3 C 2
169 adc $0, w3 C 2
170 mov 8(up,n,8), %rax C same source limb again, for v1
171 L(lo3): mul v1
172 add w1, X1 C 1
173 mov 8(rp,n,8), X0 C 2
174 adc $0, w3 C 2
175 mov %rdx, w0 C 3
176 add %rax, X0 C 2
177 adc $0, w0 C 3
178 mov 16(up,n,8), %rax C next source limb, for v0
179 mul v0
180 add w2, X0 C 2
181 mov X1, (rp,n,8) C 1
182 mov %rdx, w1 C 3
183 adc $0, w0 C 3
184 add %rax, X0 C 2
185 adc $0, w1 C 3
186 mov 16(up,n,8), %rax C same source limb again, for v1
187 add w3, X0 C 2
188 adc $0, w1 C 3
189 L(lo2): mul v1
190 mov 16(rp,n,8), X1 C 3
191 add %rax, X1 C 3
192 mov %rdx, w2 C 4
193 adc $0, w2 C 4
194 mov 24(up,n,8), %rax C next source limb, for v0
195 mov X0, 8(rp,n,8) C 2
196 mul v0
197 add w0, X1 C 3
198 mov %rdx, w3 C 4
199 adc $0, w2 C 4
200 add %rax, X1 C 3
201 mov 24(up,n,8), %rax C same source limb again, for v1
202 mov 24(rp,n,8), X0 C 0 useless but harmless final read
203 adc $0, w3 C 4
204 add $4, n C advance four limbs; the add carries out once n reaches zero
205 jnc L(top)
C Wind-down: n is zero here, so both operand forms given to I() address the
C same limbs.
207 L(end): mul v1 C rdx:rax = last source limb * v1
208 add w1, X1
209 adc $0, w3
210 add w2, %rax
211 adc $0, %rdx
212 mov X1, I(-16(rp),-16(rp,n,8)) C store rp[n_param-1]
213 add w3, %rax
214 adc $0, %rdx
215 mov %rax, I(-8(rp),-8(rp,n,8)) C store rp[n_param]
216 mov %rdx, %rax C return value: the most significant limb
218 pop %r13 C restore the saved registers in reverse push order
219 pop %r12
220 pop %rbp
221 pop %rbx
222 FUNC_EXIT() C ABI exit glue (macro defined outside this file)
223 ret C return to the caller with the top limb in %rax
224 EPILOGUE()