beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86_64 / k8 / mulmid_basecase.asm
blob86f1414ed850e92eff66aa30d2dfeeb67f3e221c
1 dnl AMD64 mpn_mulmid_basecase
3 dnl Contributed by David Harvey.
5 dnl Copyright 2011, 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
34 include(`../config.m4')
36 C cycles/limb
37 C K8,K9: 2.375 (2.5 when un - vn is "small")
38 C K10: ?
39 C P4: ?
40 C P6-15: ?
42 C INPUT PARAMETERS
C The first five names are the registers the arguments are read from.  Every
C memory operand built on rp, up or vp uses a scale of 8, i.e. the operands
C are arrays of 8-byte limbs.
43 define(`rp', `%rdi') C destination limb pointer (all result stores go through it)
44 define(`up', `%rsi') C source limbs that get multiplied by the v limbs
45 define(`un_param',`%rdx') C limb count of up; only read at entry (rdx is clobbered by mul)
46 define(`vp_param',`%rcx') C pointer to the v limbs; copied to vp at entry
47 define(`vn', `%r8') C number of v limbs still to be processed
C v0/v1: the one or two v limbs used by the current pass (loaded from (vp)
C and 8(vp)).
49 define(`v0', `%r12')
50 define(`v1', `%r9')
C w0..w3: rotating accumulator / carry registers of the inner loops.
C Note that w1 shares %rcx with vp_param, which is why vp_param is moved to
C vp before any w1 use.
52 define(`w0', `%rbx')
53 define(`w1', `%rcx')
54 define(`w2', `%rbp')
55 define(`w3', `%r10')
57 define(`n', `%r11') C inner loop index / counter
58 define(`outer_addr', `%r14') C target of the indirect jump that starts each later outer pass
59 define(`un', `%r13') C row length un_param - vn + 1, later negated and used as an index
60 define(`vp', `%r15') C working copy of vp_param
C vp_inner shares %r10 with w3.  It is only used by the L(diagonal) code,
C which uses w0, w1 and w2 but never w3.
62 define(`vp_inner', `%r10')
64 ABI_SUPPORT(DOS64)
65 ABI_SUPPORT(STD64)
67 ASM_START()
68 TEXT
69 ALIGN(16)
C mpn_mulmid_basecase: arguments rp, up, un_param, vp_param, vn (see the
C register definitions above).
C
C Entry code.  Saves the callee-saved registers used by this routine, derives
C the row length un = un_param - vn + 1 and picks one of three strategies:
C   un <  4            -> L(diagonal)
C   un >= 4, vn even   -> L(mul_2)
C   un >= 4, vn odd    -> L(mul_1), by falling through
C rp is advanced by un limbs here; the code paths below store result limbs
C from -un(rp) up to and including 8(rp), i.e. un + 2 limbs in total.
70 PROLOGUE(mpn_mulmid_basecase)
71 FUNC_ENTRY(4)
C NOTE(review): FUNC_ENTRY and IFDOS are defined outside this file.  Presumably
C this load fetches the fifth argument (vn) from the stack in the DOS64 ABI;
C confirm against the macro definitions.
72 IFDOS(` mov 56(%rsp), %r8d ')
C Save rbx, rbp, r12-r15; they are restored in reverse order at L(ret).
73 push %rbx
74 push %rbp
75 push %r12
76 push %r13
77 push %r14
78 push %r15
80 mov vp_param, vp C free %rcx, which doubles as w1
82 C use un for row length (= un_param - vn + 1)
83 lea 1(un_param), un
84 sub vn, un
86 lea (rp,un,8), rp C rp += un limbs; loops index it with negative offsets
88 cmp $4, un C TODO: needs tuning
89 jc L(diagonal) C unsigned un < 4: short rows use the diagonal code
91 lea (up,un_param,8), up C up += un_param limbs (end of the source operand)
93 test $1, vn
94 jz L(mul_2) C even vn starts with two v limbs, odd vn falls into mul_1
96 C ===========================================================
97 C mul_1 for vp[0] if vn is odd
C
C First pass when vn is odd: multiplies one row of source limbs by the single
C limb v0 = vp[0] and stores (does not add) the products into rp.  After this
C pass vn is decremented by one and control continues in the addmul_2 code
C through outer_addr.
C
C The inner loop is unrolled four ways.  un mod 4 selects which of the four
C entry points is used for the first, partial trip; each prologue moves the
C already computed first product into the register pair that entry point
C expects and records the addmul_2 prologue for the same residue in
C outer_addr.
99 L(mul_1):
100 mov R32(un), R32(w0) C keep the low bits of un for the mod 4 dispatch below
102 neg un C negative index, counted from the end of up
103 mov (up,un,8), %rax C first source limb of the row
104 mov (vp), v0
105 mul v0 C rdx:rax = first product, computed before dispatching
107 and $-4, un C round down to multiple of 4
108 mov un, n
110 and $3, R32(w0) C w0 = row length mod 4
111 jz L(mul_1_prologue_0)
112 cmp $2, R32(w0)
113 jc L(mul_1_prologue_1) C residue 1
114 jz L(mul_1_prologue_2) C residue 2; residue 3 falls through
116 L(mul_1_prologue_3):
117 mov %rax, w3
118 mov %rdx, w0
119 lea L(addmul_prologue_3)(%rip), outer_addr
120 jmp L(mul_1_entry_3)
122 ALIGN(16)
123 L(mul_1_prologue_0):
124 mov %rax, w2
125 mov %rdx, w3 C note already w0 == 0
126 lea L(addmul_prologue_0)(%rip), outer_addr
127 jmp L(mul_1_entry_0)
129 ALIGN(16)
130 L(mul_1_prologue_1):
131 add $4, n C this entry sits before the add $4 of the loop, so pre-advance n
132 mov %rax, w1
133 mov %rdx, w2
134 mov $0, R32(w3)
135 mov (up,n,8), %rax C preload the limb that L(mul_1_entry_1) multiplies
136 lea L(addmul_prologue_1)(%rip), outer_addr
137 jmp L(mul_1_entry_1)
139 ALIGN(16)
140 L(mul_1_prologue_2):
141 mov %rax, w0
142 mov %rdx, w1
143 mov 24(up,n,8), %rax C preload the limb that L(mul_1_entry_2) multiplies
144 mov $0, R32(w2)
145 mov $0, R32(w3)
146 lea L(addmul_prologue_2)(%rip), outer_addr
147 jmp L(mul_1_entry_2)
150 C this loop is 10 c/loop = 2.5 c/l on K8
C Each trip multiplies four source limbs by v0 and stores four result limbs.
C n is a negative limb index that steps by 4; the loop ends once n is no
C longer negative.  The stores are interleaved with the add/adc chain; mov
C does not modify the carry flag.
152 ALIGN(16)
153 L(mul_1_top):
154 mov w0, -16(rp,n,8)
155 add %rax, w1
156 mov (up,n,8), %rax
157 adc %rdx, w2
158 L(mul_1_entry_1):
159 mov $0, R32(w0)
160 mul v0
161 mov w1, -8(rp,n,8)
162 add %rax, w2
163 adc %rdx, w3
164 L(mul_1_entry_0):
165 mov 8(up,n,8), %rax
166 mul v0
167 mov w2, (rp,n,8)
168 add %rax, w3
169 adc %rdx, w0
170 L(mul_1_entry_3):
171 mov 16(up,n,8), %rax
172 mul v0
173 mov w3, 8(rp,n,8)
174 mov $0, R32(w2) C zero
175 mov w2, w3 C zero
176 add %rax, w0
177 mov 24(up,n,8), %rax
178 mov w2, w1 C zero
179 adc %rdx, w1
180 L(mul_1_entry_2):
181 mul v0
182 add $4, n
183 js L(mul_1_top)
C Wind down: flush the pending limbs.  w2 is zero here, so it first clears
C 8(rp) and then collects the final high limb plus carry for (rp).
185 mov w0, -16(rp)
186 add %rax, w1
187 mov w1, -8(rp)
188 mov w2, 8(rp) C zero last limb of output
189 adc %rdx, w2
190 mov w2, (rp)
192 dec vn C vp[0] has been consumed
193 jz L(ret) C nothing left if vn was 1
C Set up the next pass: slide the source window down one limb, step to the
C next v limb, reload the index and the two multipliers.
195 lea -8(up), up
196 lea 8(vp), vp
198 mov un, n
199 mov (vp), v0
200 mov 8(vp), v1
202 jmp *outer_addr C addmul_2 prologue chosen by the dispatch above
204 C ===========================================================
205 C mul_2 for vp[0], vp[1] if vn is even
C
C First pass when vn is even: multiplies the source row by the two limbs
C v0 = vp[0] and v1 = vp[1] at once and stores (does not add) the result into
C rp.  After this pass vn is reduced by two and control continues in the
C addmul_2 code through outer_addr.
C
C As in mul_1 the loop is unrolled four ways and un mod 4 selects the entry
C point for the first, partial trip.  The first product (source limb times v1)
C is computed before the dispatch and each prologue moves it into the
C register pair its entry point expects.
207 ALIGN(16)
208 L(mul_2):
209 mov R32(un), R32(w0) C keep the low bits of un for the mod 4 dispatch below
211 neg un C negative index, counted from the end of up
212 mov -8(up,un,8), %rax C source limb one position below the row start, paired with v1
213 mov (vp), v0
214 mov 8(vp), v1
215 mul v1
217 and $-4, un C round down to multiple of 4
218 mov un, n
220 and $3, R32(w0) C w0 = row length mod 4
221 jz L(mul_2_prologue_0)
222 cmp $2, R32(w0)
223 jc L(mul_2_prologue_1) C residue 1
224 jz L(mul_2_prologue_2) C residue 2; residue 3 falls through
226 L(mul_2_prologue_3):
227 mov %rax, w1
228 mov %rdx, w2
229 lea L(addmul_prologue_3)(%rip), outer_addr
230 jmp L(mul_2_entry_3)
232 ALIGN(16)
233 L(mul_2_prologue_0):
234 mov %rax, w0
235 mov %rdx, w1
236 lea L(addmul_prologue_0)(%rip), outer_addr
237 jmp L(mul_2_entry_0)
239 ALIGN(16)
240 L(mul_2_prologue_1):
241 mov %rax, w3
242 mov %rdx, w0
243 mov $0, R32(w1)
244 lea L(addmul_prologue_1)(%rip), outer_addr
245 jmp L(mul_2_entry_1)
247 ALIGN(16)
248 L(mul_2_prologue_2):
249 mov %rax, w2
250 mov %rdx, w3
251 mov $0, R32(w0)
252 mov 16(up,n,8), %rax C preload the limb that L(mul_2_entry_2) multiplies by v0
253 lea L(addmul_prologue_2)(%rip), outer_addr
254 jmp L(mul_2_entry_2)
257 C this loop is 18 c/loop = 2.25 c/l on K8
C Each trip handles four source limbs, multiplying each by v0 and by v1
C (eight mul instructions) and storing four result limbs.  w0..w3 rotate as a
C three-limb window: low limb, high limb and a carry limb.
C Registers are cleared with mov $0 rather than xor in places where an
C add/adc chain is in flight, because mov leaves the carry flag untouched.
259 ALIGN(16)
260 L(mul_2_top):
261 mov -8(up,n,8), %rax
262 mul v1
263 add %rax, w0
264 adc %rdx, w1
265 L(mul_2_entry_0):
266 mov $0, R32(w2)
267 mov (up,n,8), %rax
268 mul v0
269 add %rax, w0
270 mov (up,n,8), %rax
271 adc %rdx, w1
272 adc $0, R32(w2)
273 mul v1
274 add %rax, w1
275 mov w0, (rp,n,8)
276 adc %rdx, w2
277 L(mul_2_entry_3):
278 mov 8(up,n,8), %rax
279 mul v0
280 mov $0, R32(w3)
281 add %rax, w1
282 adc %rdx, w2
283 mov $0, R32(w0)
284 adc $0, R32(w3)
285 mov 8(up,n,8), %rax
286 mov w1, 8(rp,n,8)
287 mul v1
288 add %rax, w2
289 mov 16(up,n,8), %rax
290 adc %rdx, w3
291 L(mul_2_entry_2):
292 mov $0, R32(w1)
293 mul v0
294 add %rax, w2
295 mov 16(up,n,8), %rax
296 adc %rdx, w3
297 adc $0, R32(w0)
298 mul v1
299 add %rax, w3
300 mov w2, 16(rp,n,8)
301 adc %rdx, w0
302 L(mul_2_entry_1):
303 mov 24(up,n,8), %rax
304 mul v0
305 add %rax, w3
306 adc %rdx, w0
307 adc $0, R32(w1)
308 add $4, n
309 mov w3, -8(rp,n,8) C mov keeps the flags of the add above for the jnz
310 jnz L(mul_2_top)
C Wind down: the two limbs still held in w0 and w1 are the top two result
C limbs.
312 mov w0, (rp)
313 mov w1, 8(rp)
315 sub $2, vn C vp[0] and vp[1] have been consumed
316 jz L(ret)
C Set up the next pass: step to the next pair of v limbs, slide the source
C window down two limbs, reload the index and the two multipliers.
318 lea 16(vp), vp
319 lea -16(up), up
321 mov un, n
322 mov (vp), v0
323 mov 8(vp), v1
325 jmp *outer_addr C addmul_2 prologue chosen by the dispatch above
327 C ===========================================================
328 C addmul_2 for remaining vp's
C
C Every pass after the first: multiplies the source row by the pair v0, v1 and
C adds the result into the limbs already stored at rp.  Entered through
C jmp *outer_addr with n, v0 and v1 freshly loaded.  outer_addr is not written
C anywhere in this section, so every pass re-enters through the same prologue.
C
C Each prologue computes the first product (a source limb times v1), places it
C in the register pair its loop entry point expects and clears the registers
C that must start at zero.
330 ALIGN(16)
331 L(addmul_prologue_0):
332 mov -8(up,n,8), %rax
333 mul v1
334 mov %rax, w1
335 mov %rdx, w2
336 mov $0, R32(w3)
337 jmp L(addmul_entry_0)
339 ALIGN(16)
340 L(addmul_prologue_1):
341 mov 16(up,n,8), %rax
342 mul v1
343 mov %rax, w0
344 mov %rdx, w1
345 mov $0, R32(w2)
346 mov 24(up,n,8), %rax C preload the limb that L(addmul_entry_1) multiplies by v0
347 jmp L(addmul_entry_1)
349 ALIGN(16)
350 L(addmul_prologue_2):
351 mov 8(up,n,8), %rax
352 mul v1
353 mov %rax, w3
354 mov %rdx, w0
355 mov $0, R32(w1)
356 jmp L(addmul_entry_2)
358 ALIGN(16)
359 L(addmul_prologue_3):
360 mov (up,n,8), %rax
361 mul v1
362 mov %rax, w2
363 mov %rdx, w3
364 mov $0, R32(w0)
365 mov $0, R32(w1)
366 jmp L(addmul_entry_3)
368 C this loop is 19 c/loop = 2.375 c/l on K8
C Each trip handles four source limbs (eight mul instructions) and updates
C four limbs of rp in place with add-to-memory; the carry of each memory add
C is picked up by the adc instructions that follow it.
C Registers are cleared with mov $0 rather than xor inside these chains
C because mov leaves the carry flag untouched.
370 ALIGN(16)
371 L(addmul_top):
372 mov $0, R32(w3)
373 add %rax, w0
374 mov -8(up,n,8), %rax
375 adc %rdx, w1
376 adc $0, R32(w2)
377 mul v1
378 add w0, -8(rp,n,8)
379 adc %rax, w1
380 adc %rdx, w2
381 L(addmul_entry_0):
382 mov (up,n,8), %rax
383 mul v0
384 add %rax, w1
385 mov (up,n,8), %rax
386 adc %rdx, w2
387 adc $0, R32(w3)
388 mul v1
389 add w1, (rp,n,8)
390 mov $0, R32(w1)
391 adc %rax, w2
392 mov $0, R32(w0)
393 adc %rdx, w3
394 L(addmul_entry_3):
395 mov 8(up,n,8), %rax
396 mul v0
397 add %rax, w2
398 mov 8(up,n,8), %rax
399 adc %rdx, w3
400 adc $0, R32(w0)
401 mul v1
402 add w2, 8(rp,n,8)
403 adc %rax, w3
404 adc %rdx, w0
405 L(addmul_entry_2):
406 mov 16(up,n,8), %rax
407 mul v0
408 add %rax, w3
409 mov 16(up,n,8), %rax
410 adc %rdx, w0
411 adc $0, R32(w1)
412 mul v1
413 add w3, 16(rp,n,8)
C NOTE(review): the original author gives no rationale for this nop;
C presumably it is a scheduling or alignment pad -- do not remove it without
C re-measuring.
414 nop C don't ask...
415 adc %rax, w0
416 mov $0, R32(w2)
417 mov 24(up,n,8), %rax
418 adc %rdx, w1
419 L(addmul_entry_1):
420 mul v0
421 add $4, n
422 jnz L(addmul_top)
C Wind down: fold in the last product, then add the three pending limbs into
C the top three limbs of the result.
424 add %rax, w0
425 adc %rdx, w1
426 adc $0, R32(w2)
428 add w0, -8(rp)
429 adc w1, (rp)
430 adc w2, 8(rp)
432 sub $2, vn C two more v limbs consumed
433 jz L(ret)
C Set up the next pass exactly as after mul_2.
435 lea 16(vp), vp
436 lea -16(up), up
438 mov un, n
439 mov (vp), v0
440 mov 8(vp), v1
442 jmp *outer_addr
444 C ===========================================================
445 C accumulate along diagonals if un - vn is small
C
C Taken from the entry code when the row length un is below 4.  Here up has
C not been advanced; it still points at the start of the source operand.
C
C One outer pass per result limb: the inner loop sums vn products into the
C three-limb accumulator w2:w1:w0, walking up downwards and the v limbs
C upwards.  The low limb w0 is then stored and the accumulator is shifted down
C one limb for the next result limb.
C
C The inner loop is unrolled four ways.  vn mod 4 selects the entry point; the
C prologues round vn up to a multiple of 4 and bias vp so that the fixed
C offsets used inside the loop address the right limbs when the loop is
C entered part-way through.
447 ALIGN(16)
448 L(diagonal):
449 xor R32(w0), R32(w0)
450 xor R32(w1), R32(w1)
451 xor R32(w2), R32(w2)
453 neg un C negative output index, counts up to zero
455 mov R32(vn), %eax
456 and $3, %eax C eax = vn mod 4
457 jz L(diag_prologue_0)
458 cmp $2, %eax
459 jc L(diag_prologue_1) C residue 1
460 jz L(diag_prologue_2) C residue 2; residue 3 falls through
462 L(diag_prologue_3):
463 lea -8(vp), vp
464 mov vp, vp_inner
465 add $1, vn
466 mov vn, n
467 lea L(diag_entry_3)(%rip), outer_addr C this entry loads rax itself, so later passes jump straight to it
468 jmp L(diag_entry_3)
C In the three prologues below, lea 0(%rip) yields the address of the
C instruction immediately after the lea.  jmp *outer_addr therefore re-runs
C the rax preload and the jmp that follow it on every later outer pass.
C Nothing may be inserted between the lea and that preload.
470 L(diag_prologue_0):
471 mov vp, vp_inner
472 mov vn, n
473 lea 0(%rip), outer_addr
474 mov -8(up,n,8), %rax
475 jmp L(diag_entry_0)
477 L(diag_prologue_1):
478 lea 8(vp), vp
479 mov vp, vp_inner
480 add $3, vn
481 mov vn, n
482 lea 0(%rip), outer_addr
483 mov -8(vp_inner), %rax
484 jmp L(diag_entry_1)
486 L(diag_prologue_2):
487 lea -16(vp), vp
488 mov vp, vp_inner
489 add $2, vn
490 mov vn, n
491 lea 0(%rip), outer_addr
492 mov 16(vp_inner), %rax
493 jmp L(diag_entry_2)
496 C this loop is 10 c/loop = 2.5 c/l on K8
C Each trip adds four products into w2:w1:w0.  n counts down by 4 to zero and
C vp_inner advances by four limbs per trip.
498 ALIGN(16)
499 L(diag_top):
500 add %rax, w0
501 adc %rdx, w1
502 mov -8(up,n,8), %rax
503 adc $0, w2
504 L(diag_entry_0):
505 mulq (vp_inner)
506 add %rax, w0
507 adc %rdx, w1
508 adc $0, w2
509 L(diag_entry_3):
510 mov -16(up,n,8), %rax
511 mulq 8(vp_inner)
512 add %rax, w0
513 mov 16(vp_inner), %rax
514 adc %rdx, w1
515 adc $0, w2
516 L(diag_entry_2):
517 mulq -24(up,n,8)
518 add %rax, w0
519 mov 24(vp_inner), %rax
520 adc %rdx, w1
521 lea 32(vp_inner), vp_inner C lea does not modify the flags between the adc instructions
522 adc $0, w2
523 L(diag_entry_1):
524 mulq -32(up,n,8)
525 sub $4, n
526 jnz L(diag_top)
C Fold in the last product and emit one result limb.
528 add %rax, w0
529 adc %rdx, w1
530 adc $0, w2
532 mov w0, (rp,un,8)
534 inc un
535 jz L(diag_end) C all un low limbs written
C Next result limb: restore the inner counter and v pointer, move the source
C window up one limb and shift the accumulator down by one limb.
537 mov vn, n
538 mov vp, vp_inner
540 lea 8(up), up
541 mov w1, w0
542 mov w2, w1
543 xor R32(w2), R32(w2)
545 jmp *outer_addr
C The remaining two accumulator limbs are the top two result limbs; execution
C then falls through into L(ret).
547 L(diag_end):
548 mov w1, (rp)
549 mov w2, 8(rp)
C Common exit for all code paths: restore the callee-saved registers in the
C reverse order of the pushes at entry, undo FUNC_ENTRY and return.
551 L(ret): pop %r15
552 pop %r14
553 pop %r13
554 pop %r12
555 pop %rbp
556 pop %rbx
557 FUNC_EXIT()
C FUNC_EXIT() only reverses what FUNC_ENTRY() did; it does not return, so the
C ret must be explicit.  Without it execution runs past the end of the
C function.
558 ret
559 EPILOGUE()