dnl  Provenance (extraction artifact, rewritten as comments): luatex.git,
dnl  tag beta-0.89.2, path source/libs/gmp/gmp-src/mpn/x86_64/k8/mul_basecase.asm,
dnl  upstream blob ca2efb9b2f0133950fe8ba6a4b6df677b1090662.
1 dnl AMD64 mpn_mul_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
5 dnl Copyright 2008, 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C AMD K8,K9 2.375
37 C AMD K10 2.375
38 C Intel P4 15-16
39 C Intel core2 4.45
40 C Intel corei 4.35
41 C Intel atom ?
42 C VIA nano 4.5
44 C The inner loops of this code are the result of running a code generation and
45 C optimization tool suite written by David Harvey and Torbjorn Granlund.
47 C TODO
48 C * Use fewer registers. (how??? I can't see it -- david)
49 C * Avoid some "mov $0,r" and instead use "xor r,r".
50 C * Can the top of each L(addmul_outer_n) prologue be folded into the
51 C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
52 C case where vn = 1 or 2; is it worth it?
54 C INPUT PARAMETERS
55 define(`rp', `%rdi')
56 define(`up', `%rsi')
57 define(`un_param',`%rdx')
58 define(`vp', `%rcx')
59 define(`vn', `%r8')
61 define(`v0', `%r12')
62 define(`v1', `%r9')
64 define(`w0', `%rbx')
65 define(`w1', `%r15')
66 define(`w2', `%rbp')
67 define(`w3', `%r10')
69 define(`n', `%r11')
70 define(`outer_addr', `%r14')
71 define(`un', `%r13')
73 ABI_SUPPORT(DOS64)
74 ABI_SUPPORT(STD64)
76 ASM_START()
77 TEXT
78 ALIGN(16)
C -----------------------------------------------------------------------
C void mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C                        mp_srcptr vp, mp_size_t vn)
C Schoolbook multiplication: {rp, un+vn} = {up, un} * {vp, vn}.
C NOTE(review): per the usual GMP mpn convention this presumably requires
C un >= vn >= 1 and rp not overlapping up/vp -- confirm in gmp-impl.h.
C
C Structure: an initial mul_1 pass (vn odd) or mul_2 pass (vn even) writes
C the first product row(s); the remaining v limbs are folded in two at a
C time by addmul_2 passes.  Every pass has four entry points selected by
C un mod 4; outer_addr records which addmul_2 entry the outer loop should
C re-enter via `jmp *outer_addr`.
C -----------------------------------------------------------------------
79 PROLOGUE(mpn_mul_basecase)
80 FUNC_ENTRY(4)
81 IFDOS(` mov 56(%rsp), %r8d ')
C Save all callee-saved registers used below (SysV: rbx, rbp, r12-r15).
82 push %rbx
83 push %rbp
84 push %r12
85 push %r13
86 push %r14
87 push %r15
89 xor R32(un), R32(un)
90 mov (up), %rax
91 mov (vp), v0
C un = -un_param; up and rp are advanced past their last limb so limb i
C is addressed as (up,n,8) with the negative index n counting up to 0.
93 sub un_param, un C rdx used by mul
94 mov un, n
95 mov R32(un_param), R32(w0)
97 lea (rp,un_param,8), rp
98 lea (up,un_param,8), up
C Start up[0]*v0 early; the selected prologue consumes rax:rdx.
100 mul v0
C vn odd -> begin with a mul_1 pass; vn even -> mul_2 pass.
102 test $1, R8(vn)
103 jz L(mul_2)
105 C ===========================================================
106 C mul_1 for vp[0] if vn is odd
108 L(mul_1):
C Dispatch on un mod 4 (w0 holds un_param); each prologue also points
C outer_addr at the matching addmul_2 entry for later passes.
109 and $3, R32(w0)
110 jz L(mul_1_prologue_0)
111 cmp $2, R32(w0)
112 jc L(mul_1_prologue_1)
113 jz L(mul_1_prologue_2)
115 L(mul_1_prologue_3):
116 add $-1, n
117 lea L(addmul_outer_3)(%rip), outer_addr
118 mov %rax, w3
119 mov %rdx, w0
120 jmp L(mul_1_entry_3)
122 L(mul_1_prologue_0):
123 mov %rax, w2
124 mov %rdx, w3 C note: already w0 == 0
125 lea L(addmul_outer_0)(%rip), outer_addr
126 jmp L(mul_1_entry_0)
128 L(mul_1_prologue_1):
C Special case un == 1 (un register holds -1): the whole product is the
C single limb pair already in rax:rdx; store it and return.
129 cmp $-1, un
130 jne 2f
131 mov %rax, -8(rp)
132 mov %rdx, (rp)
133 jmp L(ret)
134 2: add $1, n
135 lea L(addmul_outer_1)(%rip), outer_addr
136 mov %rax, w1
137 mov %rdx, w2
138 xor R32(w3), R32(w3)
139 mov (up,n,8), %rax
140 jmp L(mul_1_entry_1)
142 L(mul_1_prologue_2):
143 add $-2, n
144 lea L(addmul_outer_2)(%rip), outer_addr
145 mov %rax, w0
146 mov %rdx, w1
147 mov 24(up,n,8), %rax
148 xor R32(w2), R32(w2)
149 xor R32(w3), R32(w3)
150 jmp L(mul_1_entry_2)
C 4-way unrolled mul_1 loop; w0..w3 rotate as product/carry limbs.
153 C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
155 ALIGN(16)
156 L(mul_1_top):
157 mov w0, -16(rp,n,8)
158 add %rax, w1
159 mov (up,n,8), %rax
160 adc %rdx, w2
161 L(mul_1_entry_1):
162 xor R32(w0), R32(w0)
163 mul v0
164 mov w1, -8(rp,n,8)
165 add %rax, w2
166 adc %rdx, w3
167 L(mul_1_entry_0):
168 mov 8(up,n,8), %rax
169 mul v0
170 mov w2, (rp,n,8)
171 add %rax, w3
172 adc %rdx, w0
173 L(mul_1_entry_3):
174 mov 16(up,n,8), %rax
175 mul v0
176 mov w3, 8(rp,n,8)
177 xor R32(w2), R32(w2) C zero
178 mov w2, w3 C zero
179 add %rax, w0
180 mov 24(up,n,8), %rax
181 mov w2, w1 C zero
182 adc %rdx, w1
183 L(mul_1_entry_2):
184 mul v0
185 add $4, n
186 js L(mul_1_top)
C Wind down: store the final two product limbs and the top carry limb.
188 mov w0, -16(rp)
189 add %rax, w1
190 mov w1, -8(rp)
191 adc %rdx, w2
192 mov w2, (rp)
194 add $-1, vn C vn -= 1
195 jz L(ret)
C Load the next two v limbs (vp[1], vp[2]) before advancing vp, then
C enter the addmul_2 code at the entry recorded in outer_addr.
197 mov 8(vp), v0
198 mov 16(vp), v1
200 lea 8(vp), vp C vp += 1
201 lea 8(rp), rp C rp += 1
203 jmp *outer_addr
205 C ===========================================================
206 C mul_2 for vp[0], vp[1] if vn is even
C Writes {rp, un+2} = {up, un} * {v0, v1}; same un mod 4 dispatch as
C mul_1, again leaving outer_addr pointing at the matching addmul entry.
208 ALIGN(16)
209 L(mul_2):
210 mov 8(vp), v1
212 and $3, R32(w0)
213 jz L(mul_2_prologue_0)
214 cmp $2, R32(w0)
215 jz L(mul_2_prologue_2)
216 jc L(mul_2_prologue_1)
218 L(mul_2_prologue_3):
219 lea L(addmul_outer_3)(%rip), outer_addr
220 add $2, n
221 mov %rax, -16(rp,n,8)
222 mov %rdx, w2
223 xor R32(w3), R32(w3)
224 xor R32(w0), R32(w0)
225 mov -16(up,n,8), %rax
226 jmp L(mul_2_entry_3)
228 ALIGN(16)
229 L(mul_2_prologue_0):
230 add $3, n
231 mov %rax, w0
232 mov %rdx, w1
233 xor R32(w2), R32(w2)
234 mov -24(up,n,8), %rax
235 lea L(addmul_outer_0)(%rip), outer_addr
236 jmp L(mul_2_entry_0)
238 ALIGN(16)
239 L(mul_2_prologue_1):
240 mov %rax, w3
241 mov %rdx, w0
242 xor R32(w1), R32(w1)
243 lea L(addmul_outer_1)(%rip), outer_addr
244 jmp L(mul_2_entry_1)
246 ALIGN(16)
247 L(mul_2_prologue_2):
248 add $1, n
249 lea L(addmul_outer_2)(%rip), outer_addr
250 mov $0, R32(w0)
251 mov $0, R32(w1)
252 mov %rax, w2
253 mov -8(up,n,8), %rax
254 mov %rdx, w3
255 jmp L(mul_2_entry_2)
C 4-way unrolled mul_2 loop: each up limb is multiplied by both v0 and
C v1, accumulating into the rotating w0..w3 window.
C NOTE(review): up limbs are loaded twice (e.g. -24(up,n,8) below) rather
C than copied between registers; presumably a K8 scheduling choice from
C the generator tool suite -- confirm before "simplifying".
257 C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
259 ALIGN(16)
260 L(mul_2_top):
261 mov -32(up,n,8), %rax
262 mul v1
263 add %rax, w0
264 adc %rdx, w1
265 mov -24(up,n,8), %rax
266 xor R32(w2), R32(w2)
267 mul v0
268 add %rax, w0
269 mov -24(up,n,8), %rax
270 adc %rdx, w1
271 adc $0, R32(w2)
272 L(mul_2_entry_0):
273 mul v1
274 add %rax, w1
275 mov w0, -24(rp,n,8)
276 adc %rdx, w2
277 mov -16(up,n,8), %rax
278 mul v0
279 mov $0, R32(w3)
280 add %rax, w1
281 adc %rdx, w2
282 mov -16(up,n,8), %rax
283 adc $0, R32(w3)
284 mov $0, R32(w0)
285 mov w1, -16(rp,n,8)
286 L(mul_2_entry_3):
287 mul v1
288 add %rax, w2
289 mov -8(up,n,8), %rax
290 adc %rdx, w3
291 mov $0, R32(w1)
292 mul v0
293 add %rax, w2
294 mov -8(up,n,8), %rax
295 adc %rdx, w3
296 adc R32(w1), R32(w0) C adc $0, w0
297 L(mul_2_entry_2):
298 mul v1
299 add %rax, w3
300 mov w2, -8(rp,n,8)
301 adc %rdx, w0
302 mov (up,n,8), %rax
303 mul v0
304 add %rax, w3
305 adc %rdx, w0
306 adc $0, R32(w1)
307 L(mul_2_entry_1):
308 add $4, n
309 mov w3, -32(rp,n,8)
310 js L(mul_2_top)
C Wind down: the last up limb times v1, then store the top two limbs.
312 mov -32(up,n,8), %rax C FIXME: n is constant
313 mul v1
314 add %rax, w0
315 mov w0, (rp)
316 adc %rdx, w1
317 mov w1, 8(rp)
319 add $-2, vn C vn -= 2
320 jz L(ret)
321 
322 mov 16(vp), v0
323 mov 24(vp), v1
325 lea 16(vp), vp C vp += 2
326 lea 16(rp), rp C rp += 2
328 jmp *outer_addr
331 C ===========================================================
332 C addmul_2 for remaining vp's
334 C in the following prologues, we reuse un to store the
335 C adjusted value of n that is reloaded on each iteration
C Three of the four entries below adjust un once, then use
C `lea 0(%rip)` -- which yields the address of the immediately following
C instruction -- so that every later `jmp *outer_addr` re-enters *past*
C the one-time add.  L(addmul_outer_1) needs no adjustment, so it leaves
C outer_addr (set by a prologue to the label itself) unchanged.
337 L(addmul_outer_0):
338 add $3, un
339 lea 0(%rip), outer_addr
341 mov un, n
342 mov -24(up,un,8), %rax
343 mul v0
344 mov %rax, w0
345 mov -24(up,un,8), %rax
346 mov %rdx, w1
347 xor R32(w2), R32(w2)
348 jmp L(addmul_entry_0)
350 L(addmul_outer_1):
351 mov un, n
352 mov (up,un,8), %rax
353 mul v0
354 mov %rax, w3
355 mov (up,un,8), %rax
356 mov %rdx, w0
357 xor R32(w1), R32(w1)
358 jmp L(addmul_entry_1)
360 L(addmul_outer_2):
361 add $1, un
362 lea 0(%rip), outer_addr
364 mov un, n
365 mov -8(up,un,8), %rax
366 mul v0
367 xor R32(w0), R32(w0)
368 mov %rax, w2
369 xor R32(w1), R32(w1)
370 mov %rdx, w3
371 mov -8(up,un,8), %rax
372 jmp L(addmul_entry_2)
374 L(addmul_outer_3):
375 add $2, un
376 lea 0(%rip), outer_addr
378 mov un, n
379 mov -16(up,un,8), %rax
380 xor R32(w3), R32(w3)
381 mul v0
382 mov %rax, w1
383 mov -16(up,un,8), %rax
384 mov %rdx, w2
385 jmp L(addmul_entry_3)
C 4-way unrolled addmul_2 loop: like mul_2 but adds into the existing
C rp limbs (read-modify-write `add wX, off(rp,n,8)`).
387 C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
389 ALIGN(16)
390 L(addmul_top):
391 add w3, -32(rp,n,8)
392 adc %rax, w0
393 mov -24(up,n,8), %rax
394 adc %rdx, w1
395 xor R32(w2), R32(w2)
396 mul v0
397 add %rax, w0
398 mov -24(up,n,8), %rax
399 adc %rdx, w1
400 adc R32(w2), R32(w2) C adc $0, w2
401 L(addmul_entry_0):
402 mul v1
403 xor R32(w3), R32(w3)
404 add w0, -24(rp,n,8)
405 adc %rax, w1
406 mov -16(up,n,8), %rax
407 adc %rdx, w2
408 mul v0
409 add %rax, w1
410 mov -16(up,n,8), %rax
411 adc %rdx, w2
412 adc $0, R32(w3)
413 L(addmul_entry_3):
414 mul v1
415 add w1, -16(rp,n,8)
416 adc %rax, w2
417 mov -8(up,n,8), %rax
418 adc %rdx, w3
419 mul v0
420 xor R32(w0), R32(w0)
421 add %rax, w2
422 adc %rdx, w3
423 mov $0, R32(w1)
424 mov -8(up,n,8), %rax
425 adc R32(w1), R32(w0) C adc $0, w0
426 L(addmul_entry_2):
427 mul v1
428 add w2, -8(rp,n,8)
429 adc %rax, w3
430 adc %rdx, w0
431 mov (up,n,8), %rax
432 mul v0
433 add %rax, w3
434 mov (up,n,8), %rax
435 adc %rdx, w0
436 adc $0, R32(w1)
437 L(addmul_entry_1):
438 mul v1
439 add $4, n
440 js L(addmul_top)
C Wind down: fold the last product into rp and store the top two limbs.
442 add w3, -8(rp)
443 adc %rax, w0
444 mov w0, (rp)
445 adc %rdx, w1
446 mov w1, 8(rp)
448 add $-2, vn C vn -= 2
449 jz L(ret)
C Advance to the next two v limbs and loop back to the recorded entry.
451 lea 16(rp), rp C rp += 2
452 lea 16(vp), vp C vp += 2
454 mov (vp), v0
455 mov 8(vp), v1
457 jmp *outer_addr
C Restore callee-saved registers (reverse push order) and return.
459 ALIGN(16)
460 L(ret): pop %r15
461 pop %r14
462 pop %r13
463 pop %r12
464 pop %rbp
465 pop %rbx
466 FUNC_EXIT()
469 EPILOGUE()