1 dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
2 dnl It also seems good for Conroe/Wolfdale.
4 dnl Contributed to the GNU project by Torbjörn Granlund.
6 dnl Copyright
2008, 2011-2013 Free Software Foundation
, Inc.
8 dnl
This file is part of the GNU MP Library.
10 dnl The GNU MP Library is free software
; you can redistribute it and/or modify
11 dnl it under the terms of
either:
13 dnl
* the GNU Lesser General
Public License as published by the Free
14 dnl Software Foundation
; either version 3 of the License, or (at your
15 dnl option
) any later version.
19 dnl
* the GNU General
Public License as published by the Free Software
20 dnl Foundation
; either version 2 of the License, or (at your option) any
23 dnl
or both
in parallel
, as here.
25 dnl The GNU MP Library is distributed
in the hope that it will be useful
, but
26 dnl WITHOUT ANY WARRANTY
; without even the implied warranty of MERCHANTABILITY
27 dnl
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License
30 dnl You should have received copies of the GNU General
Public License
and the
31 dnl GNU Lesser General
Public License along with the GNU MP Library. If
not,
32 dnl see
https://www.gnu.
org/licenses
/.
34 include(`..
/config.m4
')
36 C cycles/limb mul_1 mul_2 mul_3 addmul_2
45 C Intel core 4.0 4.0 - 4.18-4.25
46 C Intel NHM 3.75 3.8 - 4.06-4.2
54 C The inner loops of this code are the result of running a code generation and
55 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
60 C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
62 C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
66 C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
67 C _____ _____ _____ _____
69 C \|/ | \|/ | \|/ | \|/ |
70 C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
71 C \ /|\ \ /|\ \ /|\ \ /|\
72 C \_____/ \_____/ \_____/ \_____/
75 C * Tune. None done so far.
76 C * Currently 2687 bytes, making it smaller would be nice.
77 C * Implement some basecases, say for un < 4.
78 C * Try zeroing with xor in m2 loops.
79 C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
80 C between loop header and wind-down code.
81 C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
83 C When playing with pointers, set this to $2 to fall back to conservative
84 C indexing in wind-down code.
87 C Define this to $1 to use late loop index variable as zero, $2 to use an
C Register aliases for the incoming un, vp and vn arguments.  Each body must
C sit on one line: a newline inside the quotes would be pasted into every
C instruction that uses the alias.
C NOTE(review): the _param suffix suggests these are the ABI argument
C registers on entry -- confirm against the full file.
define(`un_param', `%rdx')
define(`vp_param', `%rcx')	C FIXME reallocate vp to rcx but watch performance!
define(`vn_param', `%r8')

C vn is a memory operand addressed through the stack pointer, not a register.
define(`vn',       `(%rsp)')
112 C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
C Shorthand for a 16-byte ALIGN directive.  The body is kept on a single line
C so the expansion does not start with a stray line break.
define(`ALIGNx', `ALIGN(16)')
C N is a bit mask consulted by MOV below; it defaults to 0 unless the caller
C has already defined it.  The opening parenthesis has to follow ifdef,
C ifelse and eval directly, otherwise m4 does not collect any arguments.
ifdef(`N',,`define(`N',0)')

C MOV(src, dst, mask) expands to "mov src, dst" when (N & mask) is zero and
C to "lea (src), dst" otherwise.
define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
126 PROLOGUE(mpn_mul_basecase)
128 IFDOS(` mov 56(%rsp), %r8d ')
129 mov (up
), %rax C shared for mul_1
and mul_2
136 mov (vp_param
), v0 C shared for mul_1
and mul_2
139 sub un_param
, un C un
= -un_param
141 lea (up
,un_param
,8), up
142 lea (rp
,un_param
,8), rp
144 mul v0 C shared for mul_1
and mul_2
146 test $1, R8
(vn_param
)
149 lea 8(vp_param
), vp C
FIXME: delay until known needed
154 L
(m1x0
):test $2, R8
(un
)
161 mov %rdx
, w0 C
FIXME: Use
lea?
162 lea L
(do_am0
)(%rip
), %rbp
169 mov %rdx
, w0 C
FIXME: Use
lea?
171 lea L
(do_am2
)(%rip
), %rbp
176 mov w0
, I
(-8(rp
),8(rp
,un
,8))
177 mov %rdx
, I
((rp
),16(rp
,un
,8))
180 L
(m1x1
):test $2, R8
(un
)
189 mov %rdx
, w1 C
FIXME: Use
lea?
190 lea L
(do_am1
)(%rip
), %rbp
192 L
(1): mov %rdx
, I
((rp
),8(rp
,un
,8))
199 mov %rdx
, w1 C
FIXME: Use
lea?
200 lea L
(do_am3
)(%rip
), %rbp
207 L
(m1e2
):xor R32
(w1
), R32
(w1
)
212 L
(m1e1
):xor R32
(w0
), R32
(w0
)
218 L
(m1e0
):xor R32
(w1
), R32
(w1
)
224 L
(m1e3
):xor R32
(w0
), R32
(w0
)
233 mov w1
, I
(-16(rp
),-16(rp
,i
,8))
236 mov w0
, I
(-8(rp
),-8(rp
,i
,8))
237 mov %rdx
, I
((rp
),(rp
,i
,8))
246 lea 16(vp_param
), vp C
FIXME: delay until known needed
251 L
(bx0
): test $2, R8
(un
)
256 mov %rdx
, w1 C
FIXME: Use
lea?
261 L
(b10
): lea -2(un
), i
262 mov %rax
, w2 C
FIXME: Use
lea?
264 mov %rdx
, w3 C
FIXME: Use
lea?
268 L
(bx1
): test $2, R8
(un
)
274 mov %rdx
, w0 C
FIXME: Use
lea?
278 L
(b11
): lea -1(un
), i
279 mov %rax
, w1 C
FIXME: Use
lea?
281 mov %rdx
, w2 C
FIXME: Use
lea?
336 mov I
(-8(up
),-8(up
,i
,8)), %rax
337 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
343 mov w0
, I
((rp
),(rp
,i
,8))
344 mov w1
, I
(8(rp
),8(rp
,i
,8))
392 L
(lo0
): mov 8(up
,i
,8), %rax
438 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
440 mov X0
, I
((rp
),(rp
,i
,8))
442 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
509 mov I
(-8(up
),-8(up
,i
,8)), %rax
510 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
516 mov w0
, I
((rp
),(rp
,i
,8))
517 mov w1
, I
(8(rp
),8(rp
,i
,8))
611 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
613 mov X0
, I
((rp
),(rp
,i
,8))
615 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
682 mov I
(-8(up
),-8(up
,i
,8)), %rax
683 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
689 mov w0
, I
((rp
),(rp
,i
,8))
690 mov w1
, I
(8(rp
),8(rp
,i
,8))
784 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
786 mov X0
, I
((rp
),(rp
,i
,8))
788 mov %rdx
, I
(8(rp
),8(rp
,i
,8))
855 mov I
(-8(up
),-8(up
,i
,8)), %rax
856 mov w3
, I
(-8(rp
),-8(rp
,i
,8))
862 mov w0
, I
((rp
),(rp
,i
,8))
863 mov w1
, I
(8(rp
),8(rp
,i
,8))
957 mov X1
, I
(-8(rp
),-8(rp
,i
,8))
959 mov X0
, I
((rp
),(rp
,i
,8))
961 mov %rdx
, I
(8(rp
),8(rp
,i
,8))