dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb   cycles/limb   cycles/limb      good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9          2.0           illop        1.0/1.0         N
C AMD K10            0.85          illop                        Y/N
C AMD bull           1.39          ? 1.45                       Y/N
C AMD pile         0.8-1.4        0.7-1.4                       Y
C AMD bobcat         1.97          ? 8.17       1.5/1.5         N
C AMD jaguar         1.02          1.02         0.91/0.91       N
C Intel P4           2.26          illop                        Y/N
C Intel core         0.52          0.95         opt/0.74        Y
C Intel NHM          0.52          0.65         opt/opt         Y
C Intel SBR          0.51          0.65         opt/opt         Y
C Intel IBR          0.50          0.64         opt/0.57        Y
C Intel HWL          0.51          0.58         opt/opt         Y
C Intel BWL          0.57          0.69         opt/0.65        Y
C Intel atom         1.16          1.70         opt/opt         Y
C Intel SLM          1.02          1.52                         N
C VIA nano           1.09          1.10         opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
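C
C A minimal C sketch of the operation implemented here, for reference only
C (the name ref_com is illustrative, not a GMP entry point): every limb of
C {up,n} is complemented into {rp,n}.
C
C	void ref_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = ~up[i];	/* bitwise complement, one limb at a time */
C	}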
C
C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
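C (Of the three, movaps is one byte shorter than movdqa and movapd, since it
C carries no 66h operand-size prefix.)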
define(`movdqa', ``movaps'')

ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
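C The ifdef above keeps any COM_SSE_THRESHOLD value already defined and
C otherwise defaults it to 7 limbs.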

	cmp	$COM_SSE_THRESHOLD, n

	pcmpeqb	%xmm7, %xmm7		C set to 111...111
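C (xmm7 holds the all-ones mask; xoring a 16-byte load with it complements two
C limbs at a time in the code below.)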
	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned
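C (When rp is misaligned, a single leading limb can be complemented with plain
C 64-bit code so that the 16-byte stores below hit aligned addresses.)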

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,

L(atop):movdqa	0(up), %xmm0

C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
	sub	rp, %rax		C %rax = (up - rp) - 40
	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
	jbe	L(bc)			C deflect to plain loop
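C (The single unsigned compare above deflects exactly the cases where up - rp
C is 5 to 15 limbs; all other differences proceed on the SSE path.  The code
C below relies on palignr($8, s, d), which shifts the concatenation d:s right
C by 8 bytes and keeps the low 16 bytes in d, so that s's high qword becomes
C d's low qword and d's old low qword becomes its high qword.)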

	movdqa	120(up), %xmm3

L(utop):movdqa	120(up), %xmm3
	movdqa	%xmm0, -128(rp)

L(um):	movdqa	104(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 112(rp)
	palignr($8, %xmm1, %xmm2)
	palignr($8, %xmm0, %xmm1)
	palignr($8, %xmm3, %xmm0)
	palignr($8, %xmm2, %xmm3)
	palignr($8, %xmm1, %xmm2)
	palignr($8, %xmm0, %xmm1)
	palignr($8, %xmm3, %xmm0)
	movdqa	%xmm0, -128(rp)

L(uend):test	$8, R8(n)
	palignr($8, %xmm2, %xmm3)
	palignr($8, %xmm1, %xmm2)
	palignr($8, %xmm0, %xmm1)
	palignr($8, %xmm3, %xmm0)
	palignr($8, %xmm0, %xmm1)
	palignr($8, %xmm3, %xmm0)
	palignr($8, %xmm3, %xmm0)

C Basecase code.  Needed for good small operands speed, not for correctness as
C the above code is currently written.

L(bc):	lea	-8(rp), rp

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,

L(top):	mov	(up), %r8

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,

L(end):	test	$1, R8(n)
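C (A set low bit of n at L(end) indicates one final limb left for the 64-bit
C tail code to complement.)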