1 dnl AMD64 mpn_lshift optimised for CPUs with fast SSE.
3 dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
5 dnl Copyright 2010-2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl   * the GNU Lesser General Public License as published by the Free
13 dnl     Software Foundation; either version 3 of the License, or (at your
14 dnl     option) any later version.
16 dnl or
18 dnl   * the GNU General Public License as published by the Free Software
19 dnl     Foundation; either version 2 of the License, or (at your option) any
20 dnl     later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
36 C              cycles/limb          cycles/limb          good
37 C              16-byte aligned      16-byte unaligned    for cpu?
39 C AMD K10      1.68 (1.45)          1.75 (1.49)          Y
40 C AMD bd1      1.82 (1.75)          1.82 (1.75)          Y
42 C Intel P4     3 (2.7)              3 (2.7)              Y
43 C Intel core2  2.05 (1.67)          2.55 (1.75)
44 C Intel NHM    2.05 (1.75)          2.09 (2)
45 C Intel SBR    1.5 (1.3125)         1.5 (1.4375)         Y
47 C VIA nano     2.25 (2)             2.5 (2)              Y
49 C We try to do as many 16-byte operations as possible. The top-most and
50 C bottom-most writes might need 8-byte operations.
52 C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
53 C not true. The aligned case reads 16+8 bytes, the unaligned case reads
54 C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
56 C This is not yet great code:
57 C (1) The unaligned case makes many reads.
58 C (2) We should do some unrolling, at least 2-way.
59 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
60 C Nano.
74 sub R32(%rcx), R32(%rax)
84 lea (rp,n,8), R32(%rcx)
88 C Do one initial limb in order to make rp aligned
89 movq -8(ap,n,8), %xmm0
90 movq -16(ap,n,8), %xmm1
94 movq %xmm0, -8(rp,n,8)
98 lea (ap,n,8), R32(%rcx)
102 C *****************************************************************************
104 C Handle the case when ap != rp (mod 16).
107 L(utop):movdqa -8(ap,n,8), %xmm0
109 punpcklqdq 8(ap,n,8), %xmm1
113 movdqa %xmm0, (rp,n,8)
121 punpcklqdq %xmm1, %xmm0
122 punpcklqdq 8(ap), %xmm1
128 C *****************************************************************************
130 C Handle the case when ap = rp (mod 16).
133 L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
134 movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
135 punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
139 movdqa %xmm0, (rp,n,8)
147 punpcklqdq %xmm1, %xmm0
153 C *****************************************************************************
165 L(end8):movq (ap), %xmm0