dnl AMD64 mpn_mullo_basecase optimised for Intel Sandy Bridge and Ivy Bridge.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C cycles/limb	mul_2		addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR	 2.5		 2.95
C Intel IBR	 2.3		 2.68
C Intel HWL	 2.0		 2.5
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Implement proper cor2, replacing current cor0.
C  * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
C  * Micro-optimise.
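
C For reference, a minimal C sketch (not part of GMP) of the contract this
C routine implements, assuming 64-bit limbs, GCC/Clang unsigned __int128, and
C the mp_limb_t/mp_size_t types from gmp.h; mullo_ref is a hypothetical name.
C rp[0..n-1] receives the low n limbs of {up,n} * {vp,n}; carries beyond
C limb n-1 are discarded.
C
C	void mullo_ref (mp_limb_t *rp, const mp_limb_t *up,
C			const mp_limb_t *vp, mp_size_t n)
C	{
C	  for (mp_size_t i = 0; i < n; i++)
C	    rp[i] = 0;
C	  for (mp_size_t i = 0; i < n; i++)	/* schoolbook, truncated */
C	    {
C	      mp_limb_t cy = 0;
C	      for (mp_size_t j = 0; i + j < n; j++)
C		{
C		  unsigned __int128 t = (unsigned __int128) up[j] * vp[i]
C					+ rp[i+j] + cy;
C		  rp[i+j] = (mp_limb_t) t;
C		  cy = (mp_limb_t) (t >> 64);
C		}
C	    }
C	}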

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
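
C For instance, with the default definition I(`$1') above, the wind-down store
C	mov	%rax, I(-8(rp),-8(rp,i,8))
C assembles as mov %rax, -8(rp); redefining I as `$2' instead selects the
C conservative indexed form -8(rp,i,8).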

define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

define(`vp',	`%r8')
define(`X0',	`%r14')
define(`X1',	`%r15')

define(`w0',	`%r10')
define(`w1',	`%r11')
define(`w2',	`%r12')
define(`w3',	`%r13')
define(`i',	`%rbp')
define(`v0',	`%r9')
define(`v1',	`%rbx')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)

	mov	(up), %rax
	mov	vp_param, vp

	cmp	$4, n
	jb	L(small)

	mov	(vp_param), v0
	push	%rbx
	lea	(rp,n,8), rp	C point rp at R[un]
	push	%rbp
	lea	(up,n,8), up	C point up right after U's end
	push	%r12
	neg	n
	push	%r13
	mul	v0
	mov	8(vp), v1
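
C First pass: a mul_2 loop computes R = U * {v0,v1}; the parity of n picks
C the loop entry point (m2b0/m2b1) so that the 2-way unrolled loop body
C lines up with the operand size.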

	test	$1, R8(n)
	jnz	L(m2b1)

L(m2b0):lea	(n), i
	xor	w0, w0
	mov	%rax, w2
	mov	%rdx, w1
	jmp	L(m2l0)

L(m2b1):lea	1(n), i
	xor	w1, w1
	xor	w2, w2
	mov	%rax, w0
	mov	%rdx, w3
	jmp	L(m2l1)

	ALIGN(32)
L(m2tp):mul	v0
	add	%rax, w0
	mov	%rdx, w3
	adc	$0, w3
L(m2l1):mov	-8(up,i,8), %rax
	mul	v1
	add	w1, w0
	adc	$0, w3
	add	%rax, w2
	mov	w0, -8(rp,i,8)
	mov	%rdx, w0
	adc	$0, w0
	mov	(up,i,8), %rax
	mul	v0
	add	%rax, w2
	mov	%rdx, w1
	adc	$0, w1
	add	w3, w2
L(m2l0):mov	(up,i,8), %rax
	adc	$0, w1
	mul	v1
	mov	w2, (rp,i,8)
	add	%rax, w0
	mov	%rdx, w2	C FIXME: dead in last iteration
	mov	8(up,i,8), %rax
	adc	$0, w2		C FIXME: dead in last iteration
	add	$2, i
	jnc	L(m2tp)

L(m2ed):imul	v0, %rax
	add	w0, %rax
	add	w1, %rax
	mov	%rax, I(-8(rp),-8(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jge	L(cor1)

	push	%r14
	push	%r15
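
C Main outer loop: each iteration folds two more limbs of V into R with an
C addmul_2 loop, truncating the update at the low n limbs; n mod 4 selects
C the entry point (a100/a110/a101/a111) into the 4-way unrolled loop body.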

L(outer):
	mov	(vp), v0
	mov	8(vp), v1
	mov	(up,n,8), %rax
	mul	v0
	test	$1, R8(n)
	jnz	L(a1x1)

L(a1x0):mov	(rp,n,8), X1
	xor	w2, w2
	xor	w1, w1
	test	$2, R8(n)
	jnz	L(a110)

L(a100):lea	1(n), i
	jmp	L(lo0)

L(a110):lea	3(n), i
	mov	%rdx, w3
	add	%rax, X1
	mov	(up,n,8), %rax
	mov	8(rp,n,8), X0
	adc	$0, w3
	jmp	L(lo2)

L(a1x1):mov	(rp,n,8), X0
	xor	w0, w0
	mov	%rdx, w1
	test	$2, R8(n)
	jz	L(a111)

L(a101):lea	2(n), i
	add	%rax, X0
	adc	$0, w1
	mov	(up,n,8), %rax
	mul	v1
	mov	8(rp,n,8), X1
	jmp	L(lo1)

L(a111):lea	(n), i
	xor	w3, w3
	jmp	L(lo3)

	ALIGN(32)
L(top):
L(lo2):	mul	v1
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	adc	$0, w0
	mov	-16(up,i,8), %rax
	mul	v0
	add	%rax, X0
	mov	%rdx, w1
	adc	$0, w1
	mov	-16(up,i,8), %rax
	mul	v1
	mov	X1, -24(rp,i,8)
	mov	-8(rp,i,8), X1
	add	w3, X0
	adc	$0, w1
L(lo1):	mov	%rdx, w2
	mov	X0, -16(rp,i,8)
	add	%rax, X1
	adc	$0, w2
	mov	-8(up,i,8), %rax
	add	w0, X1
	adc	$0, w2
	mul	v0
L(lo0):	add	%rax, X1
	mov	%rdx, w3
	adc	$0, w3
	mov	-8(up,i,8), %rax
	mul	v1
	add	w1, X1
	mov	(rp,i,8), X0
	adc	$0, w3
	mov	%rdx, w0
	add	%rax, X0
	adc	$0, w0
	mov	(up,i,8), %rax
	mul	v0
	add	w2, X0
	mov	X1, -8(rp,i,8)
	mov	%rdx, w1
	adc	$0, w0
L(lo3):	add	%rax, X0
	adc	$0, w1
	mov	(up,i,8), %rax
	add	w3, X0
	adc	$0, w1
	mul	v1
	mov	8(rp,i,8), X1
	add	%rax, X1
	mov	%rdx, w2
	adc	$0, w2
	mov	8(up,i,8), %rax
	mov	X0, (rp,i,8)
	mul	v0
	add	w0, X1
	mov	%rdx, w3
	adc	$0, w2
	add	%rax, X1
	mov	8(up,i,8), %rax
	mov	16(rp,i,8), X0
	adc	$0, w3
	add	$4, i
	jnc	L(top)

L(end):	imul	v1, %rax
	add	%rax, X0
	add	w1, X1
	adc	$0, w3
	add	w2, X0
	mov	I(-8(up),-16(up,i,8)), %rax
	imul	v0, %rax
	add	X0, %rax
	mov	X1, I(-16(rp),-24(rp,i,8))
	add	w3, %rax
	mov	%rax, I(-8(rp),-16(rp,i,8))

	add	$2, n
	lea	16(vp), vp
	lea	-16(up), up
	cmp	$-2, n
	jl	L(outer)

	pop	%r15
	pop	%r14

	jnz	L(cor0)
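
C Wind-down: the flags from the cmp above survive the pops, so when a single
C odd column remains (n was -1) we branch to cor0, which needs only one
C low-limb product; otherwise we fall through to cor1 for the final two
C columns.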

L(cor1):mov	(vp), v0
	mov	8(vp), v1
	mov	-16(up), %rax
	mul	v0			C u0 x v2
	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
	mov	-8(up), %r10
	imul	v0, %r10
	mov	-16(up), %r11
	imul	v1, %r11
	mov	%rax, -16(rp)
	add	%r10, %r11
	add	%rdx, %r11
	mov	%r11, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(cor0):mov	(vp), %r11
	imul	-8(up), %r11
	add	%rax, %r11
	mov	%r11, -8(rp)
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

	ALIGN(16)
L(small):
	cmp	$2, n
	jae	L(gt1)
L(n1):	imul	(vp_param), %rax
	mov	%rax, (rp)
	FUNC_EXIT()
	ret
L(gt1):	ja	L(gt2)
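
C Two-limb case: the truncated product is rp[0] = lo(u0*v0) and, mod 2^64,
C rp[1] = hi(u0*v0) + u1*v0 + u0*v1, which the mul and the two imuls below
C compute.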
L(n2):	mov	(vp_param), %r9
	mul	%r9
	mov	%rax, (rp)
	mov	8(up), %rax
	imul	%r9, %rax
	add	%rax, %rdx
	mov	8(vp), %r9
	mov	(up), %rcx
	imul	%r9, %rcx
	add	%rcx, %rdx
	mov	%rdx, 8(rp)
	FUNC_EXIT()
	ret

L(gt2):
L(n3):	mov	(vp_param), %r9
	mul	%r9			C u0 x v0
	mov	%rax, (rp)
	mov	%rdx, %r10
	mov	8(up), %rax
	mul	%r9			C u1 x v0
	imul	16(up), %r9		C u2 x v0
	add	%rax, %r10
	adc	%rdx, %r9
	mov	8(vp), %r11
	mov	(up), %rax
	mul	%r11			C u0 x v1
	add	%rax, %r10
	adc	%rdx, %r9
	imul	8(up), %r11		C u1 x v1
	add	%r11, %r9
	mov	%r10, 8(rp)
	mov	16(vp), %r10
	mov	(up), %rax
	imul	%rax, %r10		C u0 x v2
	add	%r10, %r9
	mov	%r9, 16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()