dnl AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.

dnl Contributed to the GNU project by Torbjörn Granlund.

dnl Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.

include(`../config.m4')

C               cycles/limb
C                mul_2      addmul_2
C AMD K8,K9
C AMD K10
C AMD bull
C AMD pile
C AMD steam
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core      4.0       4.18-4.25
C Intel NHM       3.75      4.06-4.2
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Implement proper cor2, replacing current cor0.
C  * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
C  * Micro-optimise.
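
C In C terms, the function computes the n least significant limbs of
C {up,n} * {vp,n} and stores them at {rp,n}.  A reference model, as a
C sketch only (ref_mullo is a hypothetical name, not a GMP function):
C
C   void ref_mullo (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C   {
C     mpn_mul_1 (rp, up, n, vp[0]);          /* v[0] row; carry out dropped  */
C     for (mp_size_t j = 1; j < n; j++)      /* remaining rows, truncated    */
C       mpn_addmul_1 (rp + j, up, n - j, vp[j]);
C   }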

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')
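
C For instance, the wind-down store below is written
C mov %rax, I(-8(rp),-8(rp,i,8)): with `$1' it assembles as the plain
C -8(rp) form, with `$2' as the conservative indexed -8(rp,i,8) form.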

define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`vp_param', `%rdx')
define(`n_param',  `%rcx')

define(`v0', `%r10')
define(`v1', `%r11')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r12')
define(`n',  `%r9')
define(`i',  `%r13')
define(`vp', `%r8')

define(`X0', `%r14')
define(`X1', `%r15')

C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

define(`ALIGNx', `ALIGN(16)')

define(`N', 85)
ifdef(`N',,`define(`N',0)')
define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
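
C MOV(src,dst,mask) copies a register either with mov or with an
C equivalent lea, selected per call site by the bit mask N: with
C N = 85 = 0x55, the sites tagged 1, 4, 16 and 64 get lea while those
C tagged 2, 8, 32 and 128 get mov.  A tuning knob: both forms have the
C same architectural effect but may schedule differently.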

ASM_START()
        TEXT
        ALIGN(32)
PROLOGUE(mpn_mullo_basecase)
        FUNC_ENTRY(4)

        mov     (up), %rax
        mov     vp_param, vp

        cmp     $4, n_param
        jb      L(small)

        mov     (vp_param), v0
        push    %rbx
        lea     (rp,n_param,8), rp      C point rp at R[un]
        push    %rbp
        lea     (up,n_param,8), up      C point up right after U's end
        push    %r12
        mov     $0, R32(n)              C FIXME
        sub     n_param, n
        push    %r13
        mul     v0
        mov     8(vp), v1
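
C Enter the 4-way unrolled mul_2 loop below at the phase matching
C n_param mod 4 (L(m2e0) through L(m2e3)).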

        test    $1, R8(n_param)
        jnz     L(m2x1)

L(m2x0):test    $2, R8(n_param)
        jnz     L(m2b2)

L(m2b0):lea     (n), i
        mov     %rax, (rp,n,8)
        mov     %rdx, w1
        mov     (up,n,8), %rax
        xor     R32(w2), R32(w2)
        jmp     L(m2e0)

L(m2b2):lea     -2(n), i
        mov     %rax, w2
        mov     (up,n,8), %rax
        mov     %rdx, w3
        xor     R32(w0), R32(w0)
        jmp     L(m2e2)

L(m2x1):test    $2, R8(n_param)
        jnz     L(m2b3)

L(m2b1):lea     1(n), i
        mov     %rax, (rp,n,8)
        mov     (up,n,8), %rax
        mov     %rdx, w0
        xor     R32(w1), R32(w1)
        jmp     L(m2e1)

L(m2b3):lea     -1(n), i
        xor     R32(w3), R32(w3)
        mov     %rax, w1
        mov     %rdx, w2
        mov     (up,n,8), %rax
        jmp     L(m2e3)
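
C mul_2 loop: first pass over U, computing {up,n} * {v0,v1} and keeping
C only the low n result limbs, four U limbs per iteration.  The tail at
C L(m2ed) needs just the low half of its last product, hence imul.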

        ALIGNx
L(m2tp):mul     v0
        add     %rax, w3
        mov     -8(up,i,8), %rax
        mov     w3, -8(rp,i,8)
        adc     %rdx, w0
        adc     $0, R32(w1)
L(m2e1):mul     v1
        add     %rax, w0
        adc     %rdx, w1
        mov     $0, R32(w2)
        mov     (up,i,8), %rax
        mul     v0
        add     %rax, w0
        mov     w0, (rp,i,8)
        adc     %rdx, w1
        mov     (up,i,8), %rax
        adc     $0, R32(w2)
L(m2e0):mul     v1
        add     %rax, w1
        adc     %rdx, w2
        mov     8(up,i,8), %rax
        mul     v0
        mov     $0, R32(w3)
        add     %rax, w1
        adc     %rdx, w2
        adc     $0, R32(w3)
        mov     8(up,i,8), %rax
L(m2e3):mul     v1
        add     %rax, w2
        mov     w1, 8(rp,i,8)
        adc     %rdx, w3
        mov     $0, R32(w0)
        mov     16(up,i,8), %rax
        mul     v0
        add     %rax, w2
        mov     16(up,i,8), %rax
        adc     %rdx, w3
        adc     $0, R32(w0)
L(m2e2):mul     v1
        mov     $0, R32(w1)             C FIXME: dead in last iteration
        add     %rax, w3
        mov     24(up,i,8), %rax
        mov     w2, 16(rp,i,8)
        adc     %rdx, w0                C FIXME: dead in last iteration
        add     $4, i
        js      L(m2tp)

L(m2ed):imul    v0, %rax
        add     w3, %rax
        mov     %rax, I(-8(rp),-8(rp,i,8))

        add     $2, n
        lea     16(vp), vp
        lea     -16(up), up
        cmp     $-2, n
        jge     L(cor1)

        push    %r14
        push    %r15
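
C Outer loop: one addmul_2 pass per two V limbs, accumulating U times
C v[2k], v[2k+1] into the result.  Only the low n product limbs are
C wanted, so each pass starts two limbs higher in rp and touches two
C fewer U limbs than the one before it.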

L(outer):
        mov     (vp), v0
        mov     8(vp), v1
        mov     (up,n,8), %rax
        mul     v0
        test    $1, R8(n)
        jnz     L(a1x1)

L(a1x0):mov     %rax, X1
        MOV(    %rdx, X0, 8)
        mov     (up,n,8), %rax
        mul     v1
        test    $2, R8(n)
        jnz     L(a110)

L(a100):lea     (n), i
        mov     (rp,n,8), w3
        mov     %rax, w0
        MOV(    %rdx, w1, 16)
        jmp     L(lo0)

L(a110):lea     2(n), i
        mov     (rp,n,8), w1
        mov     %rax, w2
        mov     8(up,n,8), %rax
        MOV(    %rdx, w3, 1)
        jmp     L(lo2)

L(a1x1):mov     %rax, X0
        MOV(    %rdx, X1, 2)
        mov     (up,n,8), %rax
        mul     v1
        test    $2, R8(n)
        jz      L(a111)

L(a101):lea     1(n), i
        MOV(    %rdx, w0, 4)
        mov     (rp,n,8), w2
        mov     %rax, w3
        jmp     L(lo1)

L(a111):lea     -1(n), i
        MOV(    %rdx, w2, 64)
        mov     %rax, w1
        mov     (rp,n,8), w0
        mov     8(up,n,8), %rax
        jmp     L(lo3)
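
C 4-way unrolled addmul_2 inner loop, entered at L(lo0) through L(lo3)
C according to the remaining length mod 4.  The L(end) tail again uses
C imul for the two products whose high halves fall outside the result.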

        ALIGNx
L(top): mul     v1
        add     w0, w1
        adc     %rax, w2
        mov     -8(up,i,8), %rax
        MOV(    %rdx, w3, 1)
        adc     $0, w3
L(lo2): mul     v0
        add     w1, X1
        mov     X1, -16(rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 2)
        adc     $0, X1
        mov     -8(up,i,8), %rax
        mul     v1
        MOV(    %rdx, w0, 4)
        mov     -8(rp,i,8), w1
        add     w1, w2
        adc     %rax, w3
        adc     $0, w0
L(lo1): mov     (up,i,8), %rax
        mul     v0
        add     w2, X0
        adc     %rax, X1
        mov     X0, -8(rp,i,8)
        MOV(    %rdx, X0, 8)
        adc     $0, X0
        mov     (up,i,8), %rax
        mov     (rp,i,8), w2
        mul     v1
        add     w2, w3
        adc     %rax, w0
        MOV(    %rdx, w1, 16)
        adc     $0, w1
L(lo0): mov     8(up,i,8), %rax
        mul     v0
        add     w3, X1
        mov     X1, (rp,i,8)
        adc     %rax, X0
        MOV(    %rdx, X1, 32)
        mov     8(rp,i,8), w3
        adc     $0, X1
        mov     8(up,i,8), %rax
        mul     v1
        add     w3, w0
        MOV(    %rdx, w2, 64)
        adc     %rax, w1
        mov     16(up,i,8), %rax
        adc     $0, w2
L(lo3): mul     v0
        add     w0, X0
        mov     X0, 8(rp,i,8)
        MOV(    %rdx, X0, 128)
        adc     %rax, X1
        mov     16(up,i,8), %rax
        mov     16(rp,i,8), w0
        adc     $0, X0
        add     $4, i
        jnc     L(top)

L(end): imul    v1, %rax
        add     w0, w1
        adc     %rax, w2
        mov     I(-8(up),-8(up,i,8)), %rax
        imul    v0, %rax
        add     w1, X1
        mov     X1, I(-16(rp),-16(rp,i,8))
        adc     X0, %rax
        mov     I(-8(rp),-8(rp,i,8)), w1
        add     w1, w2
        add     w2, %rax
        mov     %rax, I(-8(rp),-8(rp,i,8))

        add     $2, n
        lea     16(vp), vp
        lea     -16(up), up
        cmp     $-2, n
        jl      L(outer)

        pop     %r15
        pop     %r14

        jnz     L(cor0)
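
C Wind down.  L(cor1) forms the last two result limbs from the two
C remaining V limbs (one widening mul plus two imuls); L(cor0) forms
C just the final limb.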

L(cor1):mov     (vp), v0
        mov     8(vp), v1
        mov     -16(up), %rax
        mul     v0                      C u0 x v2
        add     -16(rp), %rax           C FIXME: rp[0] still available in reg?
        adc     -8(rp), %rdx            C FIXME: rp[1] still available in reg?
        mov     -8(up), %rbx
        imul    v0, %rbx
        mov     -16(up), %rcx
        imul    v1, %rcx
        mov     %rax, -16(rp)
        add     %rbx, %rcx
        add     %rdx, %rcx
        mov     %rcx, -8(rp)
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret

L(cor0):mov     (vp), %r11
        imul    -8(up), %r11
        add     %rax, %r11
        mov     %r11, -8(rp)
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
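
C Straight-line code for small operands, n = 1, 2, 3.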

        ALIGN(16)
L(small):
        cmp     $2, n_param
        jae     L(gt1)
L(n1):  imul    (vp_param), %rax
        mov     %rax, (rp)
        FUNC_EXIT()
        ret
L(gt1): ja      L(gt2)
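C n = 2: (u0 + u1*B)(v0 + v1*B) mod B^2 = u0*v0 + (u0*v1 + u1*v0)*B,
C so one widening mul and two imuls suffice.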
L(n2):  mov     (vp_param), %r9
        mul     %r9
        mov     %rax, (rp)
        mov     8(up), %rax
        imul    %r9, %rax
        add     %rax, %rdx
        mov     8(vp), %r9
        mov     (up), %rcx
        imul    %r9, %rcx
        add     %rcx, %rdx
        mov     %rdx, 8(rp)
        FUNC_EXIT()
        ret
L(gt2):
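C n = 3: u0*v0, u1*v0 and u0*v1 are needed in full; of u2*v0, u1*v1
C and u0*v2 only the low limbs contribute, so those use imul.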
L(n3):  mov     (vp_param), %r9
        mul     %r9                     C u0 x v0
        mov     %rax, (rp)
        mov     %rdx, %r10
        mov     8(up), %rax
        mul     %r9                     C u1 x v0
        imul    16(up), %r9             C u2 x v0
        add     %rax, %r10
        adc     %rdx, %r9
        mov     8(vp), %r11
        mov     (up), %rax
        mul     %r11                    C u0 x v1
        add     %rax, %r10
        adc     %rdx, %r9
        imul    8(up), %r11             C u1 x v1
        add     %r11, %r9
        mov     %r10, 8(rp)
        mov     16(vp), %r10
        mov     (up), %rax
        imul    %rax, %r10              C u0 x v2
        add     %r10, %r9
        mov     %r9, 16(rp)
        FUNC_EXIT()
        ret
EPILOGUE()