source/libs/gmp/gmp-src/mpn/x86_64/fastsse/com-palignr.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                cycles/limb  cycles/limb  cycles/limb  good
C                aligned      unaligned    best seen    for cpu?
C AMD K8,K9      2.0          illop        1.0/1.0      N
C AMD K10        0.85         illop                     Y/N
C AMD bull       1.39         ? 1.45                    Y/N
C AMD pile       0.8-1.4      0.7-1.4                   Y
C AMD steam
C AMD excavator
C AMD bobcat     1.97         ? 8.17       1.5/1.5      N
C AMD jaguar     1.02         1.02         0.91/0.91    N
C Intel P4       2.26         illop                     Y/N
C Intel core     0.52         0.95         opt/0.74     Y
C Intel NHM      0.52         0.65         opt/opt      Y
C Intel SBR      0.51         0.65         opt/opt      Y
C Intel IBR      0.50         0.64         opt/0.57     Y
C Intel HWL      0.51         0.58         opt/opt      Y
C Intel BWL      0.57         0.69         opt/0.65     Y
C Intel atom     1.16         1.70         opt/opt      Y
C Intel SLM      1.02         1.52                      N
C VIA nano       1.09         1.10         opt/opt      Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs, so we need to contort the
C code to use it here.

C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.
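
C In C terms, the operation performed here is simply rp[i] = ~up[i] for each
C of the n limbs (a reference sketch for orientation, not taken from the GMP
C sources):
C
C   for (i = 0; i < n; i++)
C     rp[i] = ~ up[i];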

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')
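
C COM_SSE_THRESHOLD is the operand size, in limbs, at or below which the plain
C 64-bit basecase loop is used.  If it is not defined already (presumably by a
C tuned parameter set), default it to 7 limbs.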
ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        cmp     $COM_SSE_THRESHOLD, n
        jbe     L(bc)

        pcmpeqb %xmm7, %xmm7            C set to 111...111
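
C If rp is not 16-byte aligned, complement a single limb with 64-bit code
C first, so that all subsequent 16-byte stores to rp can be aligned.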
        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned
        mov     (up), %r8
        lea     8(up), up
        not     %r8
        mov     %r8, (rp)
        lea     8(rp), rp
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')
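
C Main loop for the case where both rp and up are now 16-byte aligned:
C complement 8 limbs (4 x 16 bytes) per iteration using only aligned loads
C and aligned stores.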
        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)
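
C Wind down: the remaining n mod 8 limbs are handled in binary fashion,
C first 4 limbs, then 2, then 1.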
        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
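
C In this path up sits 8 bytes away from rp's 16-byte alignment.  All loads
C and stores below remain aligned 16-byte accesses; palignr with an 8-byte
C shift combines two adjacent aligned source words and extracts the 16 bytes
C straddling them, i.e. the block that belongs at the next aligned rp address.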
        lea     -40(up), %rax           C 40 = 5 * GMP_LIMB_BYTES
        sub     rp, %rax
        cmp     $80, %rax               C 80 = (15-5) * GMP_LIMB_BYTES
        jbe     L(bc)                   C deflect to plain loop

        sub     $16, n
        jc      L(uend)

        movdqa  120(up), %xmm3

        sub     $16, n
        jmp     L(um)
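
C The loop below is software pipelined: %xmm3 is loaded ahead of L(um), and
C the complement and store of %xmm0 are carried over to the top of the next
C iteration.  Each pass complements 16 limbs (128 bytes).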
        ALIGN(16)
L(utop):movdqa  120(up), %xmm3
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)
        sub     $16, n
L(um):  movdqa  104(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  88(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 112(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  72(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 96(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  56(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 80(rp)
        palignr($8, %xmm3, %xmm0)
        movdqa  40(up), %xmm2
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     128(up), up
        lea     128(rp), rp
        jnc     L(utop)

        pxor    %xmm7, %xmm0
        movdqa  %xmm0, -128(rp)
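
C Wind down the unaligned path: handle a remaining 8, 4, 2 and finally 1
C limbs, reusing the aligned-load/palignr scheme, with the last limb done
C in 64-bit code.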
L(uend):test    $8, R8(n)
        jz      1f
        movdqa  56(up), %xmm3
        movdqa  40(up), %xmm2
        palignr($8, %xmm2, %xmm3)
        movdqa  24(up), %xmm1
        pxor    %xmm7, %xmm3
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  8(up), %xmm0
        pxor    %xmm7, %xmm2
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     64(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movdqa  24(up), %xmm1
        movdqa  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  -8(up), %xmm3
        pxor    %xmm7, %xmm1
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small-operand speed, not for correctness as
C the above code is currently written.
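
C The basecase complements 4 limbs per iteration with 64-bit operations, then
C picks up a remainder of up to 3 limbs after L(end).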
L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       ALIGN(16)')
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        not     %r8
        not     %r9
        not     %r10
        not     %r11
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        not     %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        not     %r8
        not     %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()