dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.
dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation,
dnl  Inc.
dnl  This file is part of the GNU MP Library.
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
dnl  Pull in the build-configured m4 macro definitions (ABI, symbol naming,
dnl  PROLOGUE/EPILOGUE, register aliases) relative to this architecture dir.
include(`../config.m4')
C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
C multiply insn bandwidth, without any apparent loop branch exit pipeline
C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
C the same way for all n, then it splits into 4 different wind-down blocks and
C 4 separate addmul_1 loops.
C We have not tried using the same addmul_1 loops with a switch into feed-in
C code, as we do in other basecase implementations.  Doing that could save
C substantial code volume, but would also probably add some overhead.
C * Fix slowdown for un=vn=3 (67->71) compared to default code.
C * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
C   combining addmul loops like that code.  Tolerable slowdown?
C * Lots of space could be saved by replacing the "switch" code by gradual
C   jumps out from mul_1 winddown code, perhaps with no added overhead.
C * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
define(`un_param', `%rdx')	C u limb count; 3rd integer arg in SysV AMD64 ABI
C Standard allocations
C Temp macro for allowing control over indexing.
C Define to return $1 for more conservative ptr handling.
90 PROLOGUE(mpn_mul_basecase)
92 IFDOS(` mov 56(%rsp), %r8d ')
107 L
(u2
): mul v0 C u0 x v0
122 L
(u2v2
):mov 8(vp
), v0
127 mov %rdx
, %r8 C
CAUTION: r8 realloc
148 lea -24(rp
,un_param
,8), rp
149 lea -24(up
,un_param
,8), up
161 L
(top
): mov w0
, -16(rp
,n
,8)
178 L
(L3
): mov 16(up
,n
,8), %rax
196 C Switch on n
into right addmul_l
loop
205 L
(r3
): mov w2
, X
(-8(rp
,n
,8),16(rp
))
206 mov w3
, X
((rp
,n
,8),24(rp
))
223 L
(ta3
): add w0
, -16(rp
,n
,8)
240 L
(al3
): mov 16(up
,n
,8), %rax
254 add w0
, X
(-16(rp
,n
,8),8(rp
))
257 add w2
, X
(-8(rp
,n
,8),16(rp
))
259 mov w3
, X
((rp
,n
,8),24(rp
))
263 L
(r2
): mov X
(0(up
,n
,8),(up
)), %rax
267 mov w2
, X
(-8(rp
,n
,8),-8(rp
))
270 mov X
(8(up
,n
,8),8(up
)), %rax
274 mov w0
, X
((rp
,n
,8),(rp
))
277 mov X
(16(up
,n
,8),16(up
)), %rax
281 mov w2
, X
(8(rp
,n
,8),8(rp
))
284 mov w0
, X
(16(rp
,n
,8),16(rp
))
286 mov w1
, X
(24(rp
,n
,8),24(rp
))
293 mov 16(up
,un
,8), %rax
303 L
(ta2
): add w0
, -16(rp
,n
,8)
327 L
(al2
): mov 24(up
,n
,8), %rax
334 add w0
, X
(-16(rp
,n
,8),8(rp
))
337 add w2
, X
(-8(rp
,n
,8),16(rp
))
339 mov w3
, X
((rp
,n
,8),24(rp
))
343 L
(r1
): mov X
(0(up
,n
,8),8(up
)), %rax
347 mov w2
, X
(-8(rp
,n
,8),(rp
))
350 mov X
(8(up
,n
,8),16(up
)), %rax
354 mov w0
, X
((rp
,n
,8),8(rp
))
357 mov w2
, X
(8(rp
,n
,8),16(rp
))
358 mov w3
, X
(16(rp
,n
,8),24(rp
))
365 mov -8(up
,un
,8), %rax
375 L
(ta1
): add w0
, -16(rp
,n
,8)
378 L
(al1
): mov (up
,n
,8), %rax
406 add w0
, X
(-16(rp
,n
,8),8(rp
))
409 add w2
, X
(-8(rp
,n
,8),16(rp
))
411 mov w3
, X
((rp
,n
,8),24(rp
))
415 L
(r0
): mov X
((up
,n
,8),16(up
)), %rax
419 mov w2
, X
(-8(rp
,n
,8),8(rp
))
422 mov w0
, X
((rp
,n
,8),16(rp
))
423 mov w1
, X
(8(rp
,n
,8),24(rp
))
440 L
(ta0
): add w0
, -16(rp
,n
,8)
450 L
(al0
): mov 8(up
,n
,8), %rax
471 add w0
, X
(-16(rp
,n
,8),8(rp
))
474 add w2
, X
(-8(rp
,n
,8),16(rp
))
476 mov w3
, X
((rp
,n
,8),24(rp
))