1 dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
3 dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
5 dnl Contributed to the GNU project by Torbjörn Granlund.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
dnl Pull in the shared m4 definitions from ../config.m4.
33 include(`../config.m4')
35 C cycles/limb cycles/limb cycles/limb good
36 C aligned unaligned best seen for cpu?
37 C AMD K8,K9 2.0 illop 1.0/1.0 N
38 C AMD K10 0.85 illop Y/N
39 C AMD bull 0.70 0.66 Y
40 C AMD pile 0.68 0.66 Y
43 C AMD bobcat 1.97 8.16 1.5/1.5 N
44 C AMD jaguar 0.77 0.93 0.65/opt N/Y
45 C Intel P4 2.26 illop Y/N
46 C Intel core 0.52 0.64 opt/opt Y
47 C Intel NHM 0.52 0.71 opt/opt Y
48 C Intel SBR 0.51 0.54 opt/0.51 Y
49 C Intel IBR 0.50 0.54 opt/opt Y
50 C Intel HWL 0.50 0.51 opt/opt Y
51 C Intel BWL 0.55 0.55 opt/opt Y
52 C Intel atom 1.16 1.61 opt/opt Y
53 C Intel SLM 1.02 1.07 opt/opt Y
54 C VIA nano 1.09 1.08 opt/opt Y
56 C We use only 16-byte operations, except for unaligned top-most and bottom-most
57 C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
58 C instruction is better adapted to mpn_copyd's needs; we need to contort the
59 C code to use it here.
61 C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
62 C taken from the x86_64 default code.
69 C There are three instructions for loading an aligned 128-bit quantity. We use
70 C movaps, since it has the shortest encoding.
71 dnl define(`movdqa', ``movaps'')
dnl Default COPYI_SSE_THRESHOLD to 7 when nothing has defined it beforehand.
73 ifdef(`COPYI_SSE_THRESHOLD',`
',`define(`COPYI_SSE_THRESHOLD', 7)')
C Size dispatch: compare n with COPYI_SSE_THRESHOLD.  As the comment further up
C explains, operands below the threshold are meant for the plain 64-bit loop.
81 cmp $COPYI_SSE_THRESHOLD, n
C Bit 3 of rp separates 0 (mod 16) from 8 (mod 16)
C (NOTE(review): assumes rp is at least 8-byte aligned, confirm).
84 test $8, R8(rp) C is rp 16-byte aligned?
85 jz L(rp_aligned) C jump if rp aligned
94 ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
99 L(atop):movdqa 0(up), %xmm0
137 C Code handling up - rp = 8 (mod 16)
C
C xmm6, xmm7 and xmm8 are spilled to the stack before this path uses them; the
C Microsoft x64 ABI treats xmm6-xmm15 as callee-saved.  IFDOS presumably expands
C its argument only for DOS64 (Windows) builds.
C 56 bytes = 3 x 16 bytes of save area plus 8 bytes, presumably so that rsp lands
C on a 16-byte boundary as the aligned movdqa stores require
C (NOTE(review): assumes nothing else was pushed since function entry, confirm).
142 IFDOS(` add $-56, %rsp ')
143 IFDOS(` movdqa %xmm6, (%rsp) ')
144 IFDOS(` movdqa %xmm7, 16(%rsp) ')
145 IFDOS(` movdqa %xmm8, 32(%rsp) ')
C Preload the 16-byte chunks at 120(up) and 104(up) into xmm7/xmm6, the pair
C that is combined first at L(utop).
147 movaps 120(up), %xmm7
148 movaps 104(up), %xmm6
C Main loop for the case where up and rp differ by 8 (mod 16).
C palignr($8, lo, hi) is the SSSE3 instruction named in the header comment
C (assuming the macro keeps the AT&T operand order): it leaves in hi the upper
C 8 bytes of lo followed by the lower 8 bytes of hi, i.e. the 16 bytes that
C straddle two neighbouring aligned loads.
C The chain runs xmm7 <- xmm6 <- ... <- xmm0 <- xmm8, from the highest offset
C down to the lowest.
158 L(utop):movaps -104(up), %xmm1
160 movaps -120(up), %xmm0
161 palignr($8, %xmm6, %xmm7)
162 movaps -136(up), %xmm8
C xmm7 now holds the combined 16 bytes for offset 112 of the destination.
163 movdqa %xmm7, 112(rp)
164 palignr($8, %xmm5, %xmm6)
C xmm7 has just been stored, so it is free to be reloaded
C (NOTE(review): presumably the load for the next iteration, confirm).
165 movaps 120(up), %xmm7
167 palignr($8, %xmm4, %xmm5)
C Likewise xmm6 is reloaded once its combined value is no longer needed as a
C source operand.
168 movaps 104(up), %xmm6
170 palignr($8, %xmm3, %xmm4)
173 palignr($8, %xmm2, %xmm3)
176 palignr($8, %xmm1, %xmm2)
179 palignr($8, %xmm0, %xmm1)
C xmm8 (loaded from -136(up)) supplies the low half for the last combine.
182 palignr($8, %xmm8, %xmm0)
C Same combine chain as L(utop): xmm7 <- xmm6 <- ... <- xmm0 <- xmm8, each
C palignr($8, lo, hi) merging the upper 8 bytes of lo with the lower 8 bytes of
C hi.  No reloads of xmm7/xmm6 from 120(up)/104(up) appear in this sequence
C (NOTE(review): looks like the final pass of the unrolled loop, confirm).
188 L(ued1):movaps -104(up), %xmm1
189 movaps -120(up), %xmm0
190 movaps -136(up), %xmm8
191 palignr($8, %xmm6, %xmm7)
C Store the combined 16 bytes for offset 112 of the destination.
192 movdqa %xmm7, 112(rp)
193 palignr($8, %xmm5, %xmm6)
195 palignr($8, %xmm4, %xmm5)
197 palignr($8, %xmm3, %xmm4)
199 palignr($8, %xmm2, %xmm3)
201 palignr($8, %xmm1, %xmm2)
203 palignr($8, %xmm0, %xmm1)
205 palignr($8, %xmm8, %xmm0)
C Reload xmm6, xmm7 and xmm8 from the stack slots at 0, 16 and 32 bytes and
C release the 56-byte save area.  Active only where IFDOS expands its argument
C (presumably DOS64/Windows builds, where xmm6-xmm15 are callee-saved).
209 IFDOS(` movdqa (%rsp), %xmm6 ')
210 IFDOS(` movdqa 16(%rsp), %xmm7 ')
211 IFDOS(` movdqa 32(%rsp), %xmm8 ')
212 IFDOS(` add $56, %rsp ')
C Tail handling for the misaligned case.  Bit 3 of n (value 8) is tested first.
C The palignr groups below use progressively shorter register chains: four
C combines, then two, then one.  Each combine yields 16 bytes, i.e. two limbs
C (NOTE(review): presumably these serve 8-, 4- and 2-limb remainders; the
C branches and stores between them need confirming).
214 L(ued0):test $8, R8(n)
C Four-register chain: xmm3 <- xmm2 <- xmm1 <- xmm0 <- xmm4.
221 palignr($8, %xmm2, %xmm3)
223 palignr($8, %xmm1, %xmm2)
225 palignr($8, %xmm0, %xmm1)
227 palignr($8, %xmm4, %xmm0)
C Two-register chain: xmm1 <- xmm0 <- xmm3.
236 palignr($8, %xmm0, %xmm1)
239 palignr($8, %xmm3, %xmm0)
C Single combine: xmm0 <- xmm3.
248 palignr($8, %xmm3, %xmm0)
261 C Basecase code. Needed for good speed on small operands, not for
262 C correctness as the above code is currently written.
C
C rp is biased by -8 before the loop
C (NOTE(review): confirm the store offsets in L(top) account for this bias).
264 L(bc): lea -8(rp), rp
269 L(top): mov (up), %r8
277 ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
281 ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
284 L(end): test $1, R8(n)