dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb    cycles/limb    cycles/limb    good
C                  aligned       unaligned      best seen     for cpu?
C AMD K8,K9
C AMD K10           0.85           1.64                       Y/N
C AMD bull          1.4            1.4                        N
C AMD pile          0.77           0.93                       N
C AMD steam          ?              ?
C AMD excavator      ?              ?
C AMD bobcat
C AMD jaguar        0.65           1.02         opt/0.93      Y/N
C Intel P4          2.3            2.3                        Y
C Intel core        1.0            1.0          0.52/0.64     N
C Intel NHM         0.5            0.67                       Y
C Intel SBR          0.51           0.75         opt/0.54      Y/N
C Intel IBR          0.50           0.57         opt/0.54      Y
C Intel HWL          0.50           0.57         opt/0.51      Y
C Intel BWL          0.55           0.62         opt/0.55      Y
C Intel atom
C Intel SLM          1.02           1.27         opt/1.07      Y/N
C VIA nano           1.16           5.16                       Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
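
C The ABI_SUPPORT lines above record that this code handles both the System V
C (STD64) and Windows (DOS64) x86-64 calling conventions; FUNC_ENTRY(3) and
C FUNC_EXIT() below expand to the parameter shuffling DOS64 needs and to
C nothing for STD64.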

dnl define(`movdqu', lddqu)

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)
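
C Operands of fewer than 3 limbs go to the basecase code at L(bc).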
        cmp     $3, n                   C NB: bc code below assumes this limit
        jc      L(bc)
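
C If rp is not 16-byte aligned, copy one limb with movsq so that all later
C 16-byte stores can use aligned movdqa.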
        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        movsq                           C copy single limb
        dec     n

        sub     $16, n
        jc      L(sma)
        ALIGN(16)
L(top): movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        movdqu  112(up), %xmm7
        lea     128(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        movdqa  %xmm7, 112(rp)
        lea     128(rp), rp
L(ali): sub     $16, n
        jnc     L(top)
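
C Tail: copy any remaining 8, 4, 2 and finally 1 limbs, keyed off the
C corresponding bits of n.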
L(sma): test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:      test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:      test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
        ALIGN(16)
1:
L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)
1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for correctness as
C the above code is currently written.  The commented-out lines need to be
C reinstated if this code is to be used for n > 3, and then the post loop
C offsets need fixing.

L(bc):  sub     $2, n
        jc      L(end)
        ALIGN(16)
1:      mov     (up), %rax
        mov     8(up), %rcx
dnl     lea     16(up), up
        mov     %rax, (rp)
        mov     %rcx, 8(rp)
dnl     lea     16(rp), rp
dnl     sub     $2, n
dnl     jnc     1b

        test    $1, R8(n)
        jz      L(ret)
        mov     16(up), %rax
        mov     %rax, 16(rp)
L(ret): FUNC_EXIT()
        ret
EPILOGUE()