dnl  AMD64 mpn_sqr_basecase optimised for Intel Broadwell.

dnl  Copyright 2015 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb	mul_1		addmul_1
C Intel BWL	 1.69		 1.8-1.9
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
57 C * We have 8 addmul_1 loops which fall into each other. The idea is to save
58 C on switching code, since a circularly updated computed goto target will
59 C hardly allow correct branch prediction. On 2nd thought, we now might make
60 C each of the 8 loop branches be poorly predicted since they will be
61 C executed fewer times for each time. With just one addmul_1 loop, the loop
62 C count will change only once each 8th time!
63 C * Replace sqr_diag_addlsh1 code (from haswell) with adx-aware code. We have
64 C 3 variants below, but the haswell code turns out to be fastest.
65 C * Do overlapped software pipelining.
66 C * When changing this, make sure the code which falls into the inner loops
67 C does not execute too many no-ops (for both PIC and non-PIC).
dnl  un_param: the limb-count argument n; arrives in %rdx (3rd integer arg
dnl  in the SysV AMD64 ABI).  Must be kept on one line: a newline inside the
dnl  quoted macro body would be expanded into the instruction stream.
define(`un_param', `%rdx')
dnl  un_save: callee-saved %rbx, used below to hold a masked/negated copy of
dnl  the limb count driving the addmul_1 loops (see the lea/and/neg sequence).
define(`un_save', `%rbx')
88 PROLOGUE(mpn_sqr_basecase)
95 mulx( %rdx, %rax, %rdx)
105 mulx( %rcx, %r9, %r10) C v0 * v1 W 1 2
106 mulx( %rdx, %rax, %r8) C v0 * v0 W 0 1
108 mulx( %rdx, %r11, %rdx) C v1 * v1 W 2 3
122 L(gt2): cmp $4, un_param
127 mulx( 8,(up), w2, w3)
128 mulx( 16,(up), w0, w1)
131 mulx( 16,(up), %rax, w3)
134 test R32(%rbx), R32(%rbx)
136 mulx( %rdx, %rbx, %rcx)
139 mulx( %rdx, %rax, %rbx)
141 mulx( %rdx, %rsi, %rdx)
168 lea -3(un_param), R32(un_save)
170 mov R32(un_param), R32(%rax)
171 and $-8, R32(un_save)
172 shr $3, R32(n) C count for mul_1 loop
173 neg un_save C 8*count and offert for addmul_1 loops
174 and $7, R32(%rax) C clear CF for adc as side-effect
178 lea L(mtab)(%rip), %r10
180 ` movslq
(%r10
,%rax
,4), %r8
181 lea (%r8
, %r10
), %r10
187 L
(mf0
): mulx
( 8,(up
), w2
, w3
)
192 L
(mf3
): mulx
( 8,(up
), w0
, w1
)
197 L
(mf4
): mulx
( 8,(up
), w2
, w3
)
202 L
(mf5
): mulx
( 8,(up
), w0
, w1
)
207 L
(mf6
): mulx
( 8,(up
), w2
, w3
)
212 L
(mf7
): mulx
( 8,(up
), w0
, w1
)
217 L
(mf1
): mulx
( 8,(up
), w0
, w1
)
222 L
(mf2
): mulx
( 8,(up
), w2
, w3
)
229 L
(top
): mov w2
, -8(rp
)
231 L
(mb1
): mulx
( 8,(up
), w2
, w3
)
235 L
(mb0
): mov w2
, 8(rp
)
236 mulx
( -48,(up
), w0
, w1
)
239 L
(mb7
): mulx
( -40,(up
), w2
, w3
)
242 L
(mb6
): mov w2
, -40(rp
)
243 mulx
( -32,(up
), w0
, w1
)
245 L
(mb5
): mulx
( -24,(up
), w2
, w3
)
248 L
(mb4
): mulx
( -16,(up
), w0
, w1
)
251 L
(mb3
): mulx
( -8,(up
), w2
, w3
)
258 L
(end): mov w2
, -8(rp
)
264 lea L
(atab
)(%rip
), %r10
266 ` movslq (%r10,%rax,4), %r11
267 lea (%r11, %r10), %r11
273 L(ed0): adox( (rp), w0)
274 adox( %rcx, w1) C relies on rcx = 0
276 adc %rcx, w1 C relies on rcx = 0
278 L(f7): lea -64(up,un_save,8), up
279 or R32(un_save), R32(n)
281 mulx( 16,(up), w0, w1)
282 lea -56(rp,un_save,8), rp
286 L(tp0): adox( -8,(rp), w2)
290 mulx( 8,(up), w2, w3)
295 L(b0): mulx( 16,(up), w0, w1)
299 mulx( 24,(up), w2, w3)
304 mulx( -32,(up), w0, w1)
308 mulx( -24,(up), w2, w3)
312 mulx( -16,(up), w0, w1)
317 mulx( -8,(up), w2, w3)
324 L(ed1): adox( (rp), w0)
325 adox( %rcx, w1) C relies on rcx = 0
327 adc %rcx, w1 C relies on rcx = 0
329 L(f0): lea -64(up,un_save,8), up
330 or R32(un_save), R32(n)
332 mulx( 8,(up), w2, w3)
333 lea -56(rp,un_save,8), rp
337 L(tp1): adox( -8,(rp), w2)
341 L(b1): mulx( 8,(up), w2, w3)
346 mulx( 16,(up), w0, w1)
350 mulx( 24,(up), w2, w3)
355 mulx( -32,(up), w0, w1)
359 mulx( -24,(up), w2, w3)
363 mulx( -16,(up), w0, w1)
368 mulx( -8,(up), w2, w3)
375 L(ed2): adox( (rp), w0)
376 adox( %rcx, w1) C relies on rcx = 0
378 adc %rcx, w1 C relies on rcx = 0
380 L(f1): lea (up,un_save,8), up
381 or R32(un_save), R32(n)
382 lea 8(un_save), un_save
385 lea -56(rp,un_save,8), rp
389 L(tp2): adox( -8,(rp), w2)
393 mulx( 8,(up), w2, w3)
398 mulx( 16,(up), w0, w1)
402 mulx( 24,(up), w2, w3)
407 mulx( -32,(up), w0, w1)
411 mulx( -24,(up), w2, w3)
415 mulx( -16,(up), w0, w1)
420 mulx( -8,(up), w2, w3)
427 L(ed3): adox( (rp), w0)
428 adox( %rcx, w1) C relies on rcx = 0
430 adc %rcx, w1 C relies on rcx = 0
432 L(f2): lea (up,un_save,8), up
433 or R32(un_save), R32(n)
436 mulx( -8,(up), w2, w3)
437 lea 8(rp,un_save,8), rp
442 L(tp3): adox( -8,(rp), w2)
446 mulx( 8,(up), w2, w3)
451 mulx( 16,(up), w0, w1)
455 mulx( 24,(up), w2, w3)
460 mulx( -32,(up), w0, w1)
464 mulx( -24,(up), w2, w3)
468 mulx( -16,(up), w0, w1)
472 L(b3): adox( 48,(rp), w0)
473 mulx( -8,(up), w2, w3)
480 L(ed4): adox( (rp), w0)
481 adox( %rcx, w1) C relies on rcx = 0
483 adc %rcx, w1 C relies on rcx = 0
485 L(f3): lea (up,un_save,8), up
486 or R32(un_save), R32(n)
489 mulx( -16,(up), w0, w1)
490 lea -56(rp,un_save,8), rp
494 L(tp4): adox( -8,(rp), w2)
498 mulx( 8,(up), w2, w3)
503 mulx( 16,(up), w0, w1)
507 mulx( 24,(up), w2, w3)
512 mulx( -32,(up), w0, w1)
516 mulx( -24,(up), w2, w3)
520 L(b4): mulx( -16,(up), w0, w1)
525 mulx( -8,(up), w2, w3)
532 L(ed5): adox( (rp), w0)
533 adox( %rcx, w1) C relies on rcx = 0
535 adc %rcx, w1 C relies on rcx = 0
537 L(f4): lea (up,un_save,8), up
538 or R32(un_save), R32(n)
540 mulx( -24,(up), w2, w3)
541 lea -56(rp,un_save,8), rp
545 L(tp5): adox( -8,(rp), w2)
549 mulx( 8,(up), w2, w3)
554 mulx( 16,(up), w0, w1)
558 mulx( 24,(up), w2, w3)
563 mulx( -32,(up), w0, w1)
567 L(b5): mulx( -24,(up), w2, w3)
571 mulx( -16,(up), w0, w1)
576 mulx( -8,(up), w2, w3)
583 L(ed6): adox( (rp), w0)
584 adox( %rcx, w1) C relies on rcx = 0
586 adc %rcx, w1 C relies on rcx = 0
588 L(f5): lea (up,un_save,8), up
589 or R32(un_save), R32(n)
591 mulx( -32,(up), w0, w1)
592 lea -56(rp,un_save,8), rp
596 L(tp6): adox( -8,(rp), w2)
600 mulx( 8,(up), w2, w3)
605 mulx( 16,(up), w0, w1)
609 mulx( 24,(up), w2, w3)
614 L(b6): mulx( -32,(up), w0, w1)
618 mulx( -24,(up), w2, w3)
622 mulx( -16,(up), w0, w1)
627 mulx( -8,(up), w2, w3)
634 L(ed7): adox( (rp), w0)
635 adox( %rcx, w1) C relies on rcx = 0
637 adc %rcx, w1 C relies on rcx = 0
639 L(f6): lea (up,un_save,8), up
640 or R32(un_save), R32(n)
642 mulx( -40,(up), w2, w3)
643 lea -56(rp,un_save,8), rp
647 L(tp7): adox( -8,(rp), w2)
651 mulx( 8,(up), w2, w3)
656 mulx( 16,(up), w0, w1)
660 L(b7): mulx( 24,(up), w2, w3)
665 mulx( -32,(up), w0, w1)
669 mulx( -24,(up), w2, w3)
673 mulx( -16,(up), w0, w1)
678 mulx( -8,(up), w2, w3)
687 mulx( -16,(up), w0, w1)
689 mulx( -8,(up), w2, w3)
698 adox( %rcx, w1) C relies on rcx = 0
699 adcx( %rcx, w1) C relies on rcx = 0
702 mulx( -8,(up), w2, w3)
703 mulx( (up), %rax, %rbx)
708 adox( %rcx, %rbx) C relies on rcx = 0
710 adc %rcx, %rbx C relies on rcx = 0
712 mulx( (up), %rax, %rdx)
715 adc %rcx, %rdx C relies on rcx = 0
723 ifdef(`SDA_VARIANT',,`define
(`SDA_VARIANT
', 2)')
725 ifelse
(SDA_VARIANT
,1,`
727 movq
$0, -8(rp
,%rax
,8) C FIXME
728 test R32
(%rax
), R32
(%rax
)
731 mulx
( %rdx
, %r8
, %rdx
)
735 L
(dtop
):mov 8(rp
), %r9
742 mulx
( %rdx
, %rax
, %rdx
)
753 ifelse(SDA_VARIANT,2,`
756 xor R32(%rbx), R32(%rbx) C clear CF as side effect
757 mulx( %rdx, %rax, %r10)
764 L(dtop):mov 24(rp), %r8
767 lea (%rdx,%rbx), %r10
773 mulx( %rdx, %rax, %rdx)
781 L(dend):adc %rbx, %rdx
785 ifelse
(SDA_VARIANT
,3,`
788 test R32
(%rbx
), R32
(%rbx
) C clear CF
and OF
789 mulx
( %rdx
, %rax
, %r10
)
796 L
(dtop
):jrcxz L
(dend
)
800 L
(dm
): adcx
( %r8
, %r8
)
805 mulx
( %rdx
, %rax
, %r10
)
812 L
(dend
):adcx
( %rcx
, %r10
)
823 L(mtab):JMPENT( L(mf7), L(mtab))
824 JMPENT( L(mf0), L(mtab))
825 JMPENT( L(mf1), L(mtab))
826 JMPENT( L(mf2), L(mtab))
827 JMPENT( L(mf3), L(mtab))
828 JMPENT( L(mf4), L(mtab))
829 JMPENT( L(mf5), L(mtab))
830 JMPENT( L(mf6), L(mtab))
831 L(atab):JMPENT( L(f6), L(atab))
832 JMPENT( L(f7), L(atab))
833 JMPENT( L(f0), L(atab))
834 JMPENT( L(f1), L(atab))
835 JMPENT( L(f2), L(atab))
836 JMPENT( L(f3), L(atab))
837 JMPENT( L(f4), L(atab))
838 JMPENT( L(f5), L(atab))