dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy bridge and Ivy bridge.
dnl  Contributed to the GNU project by Torbjörn Granlund.
dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
33 include(`..
/config.m4
')
35 C cycles/limb mul_2 addmul_2 sqr_diag_addlsh1
46 C Intel SBR 2.57 2.93 3.0
47 C Intel IBR 2.35 2.66 3.0
48 C Intel HWL 2.02 2.5 2.5
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
55 C that the sqr_diag_addlsh1 loop was manually written.
C  * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
C  * Streamline pointer updates.
C  * Perhaps suppress a few more xor insns in feed-in code.
C  * Make sure we write no dead registers in feed-in code.
C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
C    out for negative sizes being zero-extended, though.
C  * The straight-line code for n <= 3 comes from the K8 code, and might be
C    quite sub-optimal here.  Write specific code, and add code for n = 4.
C  * The mul_2 loop has a 10 insn common sequence in the loop start and the
C    wind-down code.  Try re-rolling it.
C  * This file has been subject to only basic micro-optimisation.
70 C When playing with pointers, set this to $2 to fall back to conservative
71 C indexing in wind-down code.
dnl  un_param is the symbolic name for %rdx.  The code visible in this file
dnl  uses it as the scaled index when pointing rp and up past their operands,
dnl  and later copies a value into un with the note "free up rdx"; presumably
dnl  it carries the incoming limb count (TODO confirm against the full file).
dnl
dnl  Keep the definition on a single line: a line break inside the quoted
dnl  body would be emitted with every expansion and would separate the
dnl  register operand from its instruction.
define(`un_param',`%rdx')
85 PROLOGUE(mpn_sqr_basecase)
125 L(gt2): cmp $4, un_param
193 lea (rp,un_param,8), rp C point rp at R[un]
196 lea (up,un_param,8), up C point up right after U's
end
199 mov $1, R32
(un
) C free up rdx
211 xor R32
(w1
), R32
(w1
) C FIXME
212 xor R32
(w2
), R32
(w2
) C FIXME
217 xor R32
(w3
), R32
(w3
) C FIXME
218 xor R32
(w0
), R32
(w0
) C FIXME
257 mov I
(-8(up
),-8(up
,n
,8)), %rax
262 mov w0
, I
(-8(rp
),-8(rp
,n
,8))
265 mov w2
, I
((rp
),(rp
,n
,8))
267 mov %rdx
, I
(8(rp
),8(rp
,n
,8))
269 add $2, un C decrease |un|
274 cmp $
-2, R32
(un
) C jump if un C
{-1,0} FIXME jump if un C
{-2,1}
275 jge L
(corner
) C
FIXME: move to before the
lea above
284 L
(a1x0
):mov (rp
,un
,8), X0
296 L
(a100
):lea 2(un
), n C un
= 4, 8, 12, ...
299 L
(a110
):lea (un
), n C un
= 2, 6, 10, ...
302 L
(a1x1
):mov (rp
,un
,8), X1
313 L
(a101
):lea 3(un
), n C un
= 1, 5, 9, ...
316 L
(a111
):lea 1(un
), n C un
= 3, 7, 11, ...
328 mov -16(up
,n
,8), %rax
333 mov -16(up
,n
,8), %rax
392 mov X1
, I
(-8(rp
),-24(rp
,n
,8))
395 mov %rax
, I
((rp
),-16(rp
,n
,8))
396 mov %rdx
, I
(8(rp
),-8(rp
,n
,8))
398 add $2, un C decrease |un|
399 jmp L
(outer
) C
loop until a
small corner remains
449 xor R32
(%rbx
), R32
(%rbx
)
455 L
(dtop
):add %r8
, %r10
464 lea (%rdx
,%rbx
), %r10
470 L
(dend
):add %r8
, %r10
472 mov %r10
, I
(-8(rp
),-8(rp
,n
,8))
473 mov %rax
, I
((rp
),(rp
,n
,8))
475 mov %rdx
, I
(8(rp
),8(rp
,n
,8))