dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb   cycles/limb   cycles/limb   good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9          2.0           illop        1.0/1.0       N
C AMD K10            0.85          illop                      Y/N
C AMD bull           0.70          0.66                       Y
C AMD pile           0.68          0.66                       Y
C AMD steam          ?             ?
C AMD excavator      ?             ?
C AMD bobcat         1.97          8.16         1.5/1.5       N
C AMD jaguar         0.77          0.93         0.65/opt      N/Y
C Intel P4           2.26          illop                      Y/N
C Intel core         0.52          0.64         opt/opt       Y
C Intel NHM          0.52          0.71         opt/opt       Y
C Intel SBR          0.51          0.54         opt/0.51      Y
C Intel IBR          0.50          0.54         opt/opt       Y
C Intel HWL          0.50          0.51         opt/opt       Y
C Intel BWL          0.55          0.55         opt/opt       Y
C Intel atom         1.16          1.61         opt/opt       Y
C Intel SLM          1.02          1.07         opt/opt       Y
C VIA nano           1.09          1.08         opt/opt       Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.
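
C The sketch below is illustrative only (not part of the original code nor of
C the build): it shows how the palignr path assembles one aligned 16-byte
C store from two aligned loads, as the 2-limb tail under L(ued0) does:
C
C	movdqa	8(up), %xmm0		C aligned load of up[1],up[2]
C	movdqa	-8(up), %xmm3		C aligned load of up[-1],up[0]
C	palignr($8, %xmm3, %xmm0)	C xmm0 = up[1]:up[0]
C	movdqa	%xmm0, (rp)		C one aligned 2-limb store
C
C The store thus takes the high half of the lower load and the low half of
C the upper load.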

C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We
C use movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)			C n <= threshold: plain 64-bit basecase

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned
	movsq				C copy one limb
	dec	n

L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)
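
C Fall through: up and rp are now both 16-byte aligned.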

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')
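
C Main aligned loop: copy eight limbs (64 bytes) per iteration.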

	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)
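
C Fewer than eight limbs remain; their count sits in the low bits of n.
C Finish in 4-, 2-, then 1-limb steps.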

	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

	cmp	$16, n
	jc	L(ued0)
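
C The Win64 ABI treats xmm6-xmm15 as callee-saved; preserve the ones
C clobbered below.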

IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
IFDOS(`	movdqa	%xmm8, 32(%rsp)	')

	movaps	120(up), %xmm7
	movaps	104(up), %xmm6
	movaps	88(up), %xmm5
	movaps	72(up), %xmm4
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	lea	128(up), up
	sub	$32, n
	jc	L(ued1)
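
C Software-pipelined main loop: 16 limbs (128 bytes) per iteration, each
C 16-byte store combining two aligned loads via palignr.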

	ALIGN(16)
L(utop):movaps	-104(up), %xmm1
	sub	$16, n
	movaps	-120(up), %xmm0
	palignr($8, %xmm6, %xmm7)
	movaps	-136(up), %xmm8
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movaps	120(up), %xmm7
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movaps	104(up), %xmm6
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movaps	88(up), %xmm5
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movaps	72(up), %xmm4
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movaps	56(up), %xmm3
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movaps	40(up), %xmm2
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	lea	128(up), up
	movdqa	%xmm0, (rp)
	lea	128(rp), rp
	jnc	L(utop)
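
C Wind down the pipeline: three last loads, then combine and store the
C final 16 limbs.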

L(ued1):movaps	-104(up), %xmm1
	movaps	-120(up), %xmm0
	movaps	-136(up), %xmm8
	palignr($8, %xmm6, %xmm7)
	movdqa	%xmm7, 112(rp)
	palignr($8, %xmm5, %xmm6)
	movdqa	%xmm6, 96(rp)
	palignr($8, %xmm4, %xmm5)
	movdqa	%xmm5, 80(rp)
	palignr($8, %xmm3, %xmm4)
	movdqa	%xmm4, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm8, %xmm0)
	movdqa	%xmm0, (rp)
	lea	128(rp), rp

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	movdqa	32(%rsp), %xmm8	')
IFDOS(`	add	$56, %rsp	')
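
C At most 15 limbs remain (this is also the entry point for n < 16).
C Finish in 8-, 4-, 2-, then 1-limb steps.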

L(ued0):test	$8, R8(n)
	jz	1f
	movaps	56(up), %xmm3
	movaps	40(up), %xmm2
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	movaps	-8(up), %xmm4
	palignr($8, %xmm2, %xmm3)
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm4, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movaps	24(up), %xmm1
	movaps	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movaps	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.
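
C rp is biased by -8 here; the store displacements below compensate.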

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)
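
C Copy four limbs per iteration.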

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')
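
C With the default COPYI_SSE_THRESHOLD of 7, at most seven limbs reach this
C code, so a single pass suffices and the ifelse above omits the sub/jnc.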

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()