1 dnl X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
2 dnl It also seems good for Conroe
/Wolfdale.
4 dnl Contributed to the GNU project by Torbjörn Granlund.
6 dnl Copyright
2008, 2011-2013 Free Software Foundation
, Inc.
8 dnl
This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
11 dnl it under the terms of
either:
13 dnl
* the GNU Lesser General
Public License as published by the Free
14 dnl Software Foundation
; either version 3 of the License, or (at your
15 dnl option
) any later version.
19 dnl
* the GNU General
Public License as published by the Free Software
20 dnl Foundation
; either version 2 of the License, or (at your option) any later version.
23 dnl
or both
in parallel
, as here.
25 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
26 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
27 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
30 dnl You should have received copies of the GNU General
Public License
and the
31 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
32 dnl see
https://www.gnu.
org/licenses
/.
34 include(`..
/config.m4
')
36 C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
45 C Intel core 4.9 4.18-4.25 3.87
46 C Intel NHM 3.8 4.06-4.2 3.5
54 C The inner loops of this code are the result of running a code generation and
55 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
60 C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
65 C ____________ ____________
68 C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
70 C \____________/ \____________/
80 C * Tune. None done so far.
81 C * Currently 2761 bytes, making it smaller would be nice.
82 C * Consider using a jumptab-based entry sequence. One might even use a mask-
83 C less sequence, if the table is large enough to support tuneup's needs.
84 C The code would be
, using non
-PIC code
,
85 C
lea tab
(%rip
),%rax
; jmp *(n,%rax)
87 C
lea tab
(%rip
),%rax
; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
88 C using PIC code. The table entries would be Ln1
,Ln2
,Ln3
,Lm0
,Lm1
,Lm2
,Lm3
,..
89 C with the last four entries repeated a safe number of times.
90 C
* Consider expanding feed
-in code
in order to avoid zeroing registers.
91 C
* Zero consistently with
xor.
92 C
* Check if using
"lea (reg),reg" should be done
in more places
; we have some
93 C explicit
"mov %rax,reg" now.
94 C
* Try zeroing with
xor in m2 loops.
95 C
* Try re
-rolling the m2 loops to avoid the current
9 insn code duplication
96 C between
loop header
and wind
-down code.
97 C
* Consider
adc reg
,reg instead of
adc $0,reg
in m2 loops.
This saves a
byte.
99 C When playing with pointers
, set
this to
$2 to fall back to conservative
100 C indexing
in wind
-down code.
103 C Define
this to
$1 to use late
loop index variable as zero
, $2 to use an
C n_param is a symbolic name for the register %rdx.  The function body
C refers to it by this name (it is subtracted from n and used as an index
C when advancing up and rp).
109 define
(`n_param
', `%rdx')
124 C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
C ALIGNx expands to ALIGN(16).  NOTE(review): its use sites are not visible
C in this excerpt; presumably it marks alignment points whose value may be
C tuned in one place - confirm against the full file.
129 define
(`ALIGNx
', `ALIGN(16)')
C N defaults to 0 unless it has already been defined.
C MOV(src, dst, mask) selects between two instruction forms at m4 time:
C   - if (N & mask) evaluates to 0, it emits  mov src, dst
C   - otherwise it emits                      lea (src), dst
C so N acts as a bit mask that picks, per use site, which form is emitted.
132 ifdef(`N',,`define
(`N
',0)')
133 define
(`
MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`
lea ($1), $2')')
138 PROLOGUE
(mpn_sqr_basecase
)
155 sub n_param
, n C n
= -n_param
+1
158 lea (up
,n_param
,8), up
159 lea (rp
,n_param
,8), rp
166 L
(bx0
): test $2, R8
(n
)
170 L
(b00
): lea (n
), i C n
= 5, 9, ...
171 mov %rdx
, w1 C
FIXME: Use
lea?
175 L
(b10
): lea 2(n
), i C n
= 7, 11, ...
177 mov %rdx
, w3 C
FIXME: Use
lea?
182 L
(bx1
): test $2, R8
(n
)
186 L
(b01
): lea 1(n
), i C n
= 6, 10, ...
187 mov %rdx
, w0 C
FIXME: Use
lea?
191 L
(b11
): lea -1(n
), i C n
= 4, 8, 12, ...
192 mov %rdx
, w2 C
FIXME: Use
lea?
208 L
(m2e1
):mov $0, R32
(w2
)
248 mov I
(-8(up
),-8(up
,i
,8)), %rax
249 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
255 mov %rax
, I
((rp
),(rp
,i
,8))
256 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
259 add $2, n C decrease |n|
295 L
(m2e3
):mov $0, R32
(w0
)
313 mov I
(-8(up
),-8(up
,i
,8)), %rax
314 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
320 mov %rax
, I
((rp
),(rp
,i
,8))
321 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
324 add $2, n C decrease |n|
326 jz L
(cor1
) C jumps iff
entry n
= 4
409 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
411 mov %rax
, I
((rp
),(rp
,i
,8))
413 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
499 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
501 mov %rax
, I
((rp
),(rp
,i
,8))
503 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
519 jmp L
(sqr_diag_addlsh1
)
572 mov I
(-8(up
),-8(up
,i
,8)), %rax
573 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
579 mov %rax
, I
((rp
),(rp
,i
,8))
580 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
583 add $2, n C decrease |n|
608 L
(m2e0
):mov 8(up
,i
,8), %rax
637 mov I
(-8(up
),-8(up
,i
,8)), %rax
638 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
644 mov %rax
, I
((rp
),(rp
,i
,8))
645 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
648 add $2, n C decrease |n|
650 jz L
(cor2
) C jumps iff
entry n
= 5
733 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
735 mov %rax
, I
((rp
),(rp
,i
,8))
737 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
776 L
(lo0
): mov 8(up
,i
,8), %rax
822 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
824 mov %rax
, I
((rp
),(rp
,i
,8))
826 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
863 xor R32
(%rbx
), R32
(%rbx
)
877 lea (%rdx
,%rbx
), %r10
880 L
(dm
): mov %rax
, (rp
,n
,8)
891 lea (%rdx
,%rbx
), %r10
942 L
(n3
): mov %rax
, %r10