2 Copyright (C) 2011-2015 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
23 .section .text.ssse3,"ax",@progbits
24 ENTRY (__wcscpy_ssse3)
53 jnz L(CopyFrom1To16Bytes)
64 /* case: rcx_offset == rdx_offset */
76 movaps 16(%rcx), %xmm2
83 jnz L(CopyFrom1To16Bytes)
85 movaps 16(%rcx, %rsi), %xmm3
86 movaps %xmm2, (%rdx, %rsi)
92 jnz L(CopyFrom1To16Bytes)
94 movaps 16(%rcx, %rsi), %xmm4
95 movaps %xmm3, (%rdx, %rsi)
101 jnz L(CopyFrom1To16Bytes)
103 movaps 16(%rcx, %rsi), %xmm1
104 movaps %xmm4, (%rdx, %rsi)
110 jnz L(CopyFrom1To16Bytes)
112 movaps 16(%rcx, %rsi), %xmm2
113 movaps %xmm1, (%rdx, %rsi)
119 jnz L(CopyFrom1To16Bytes)
121 movaps 16(%rcx, %rsi), %xmm3
122 movaps %xmm2, (%rdx, %rsi)
128 jnz L(CopyFrom1To16Bytes)
130 movaps %xmm3, (%rdx, %rsi)
132 lea 16(%rcx, %rsi), %rcx
143 movaps 16(%rcx), %xmm5
144 movaps 32(%rcx), %xmm3
146 movaps 48(%rcx), %xmm7
155 jnz L(Aligned64Leave)
156 movaps %xmm4, -64(%rdx)
157 movaps %xmm5, -48(%rdx)
158 movaps %xmm6, -32(%rdx)
159 movaps %xmm7, -16(%rdx)
166 jnz L(CopyFrom1To16Bytes)
171 movaps %xmm4, -64(%rdx)
174 jnz L(CopyFrom1To16Bytes)
179 movaps %xmm5, -48(%rdx)
182 jnz L(CopyFrom1To16Bytes)
184 movaps %xmm6, -32(%rdx)
190 jnz L(CopyFrom1To16Bytes)
193 movaps %xmm7, -16(%rdx)
198 movaps -4(%rcx), %xmm1
199 movaps 12(%rcx), %xmm2
208 palignr $4, %xmm1, %xmm2
210 movaps 28(%rcx), %xmm2
221 palignr $4, %xmm3, %xmm2
223 movaps 28(%rcx), %xmm2
234 palignr $4, %xmm1, %xmm2
236 movaps 28(%rcx), %xmm2
246 palignr $4, %xmm3, %xmm2
257 movaps -4(%rcx), %xmm1
261 movaps 12(%rcx), %xmm2
262 movaps 28(%rcx), %xmm3
264 movaps 44(%rcx), %xmm4
266 movaps 60(%rcx), %xmm5
273 palignr $4, %xmm4, %xmm5
275 palignr $4, %xmm3, %xmm4
278 palignr $4, %xmm2, %xmm3
280 palignr $4, %xmm1, %xmm2
282 movaps %xmm5, 48(%rdx)
283 movaps %xmm4, 32(%rdx)
284 movaps %xmm3, 16(%rdx)
290 movdqu -4(%rcx), %xmm1
292 movdqu %xmm1, -4(%rdx)
293 jmp L(CopyFrom1To16Bytes)
297 movaps -8(%rcx), %xmm1
298 movaps 8(%rcx), %xmm2
307 palignr $8, %xmm1, %xmm2
309 movaps 24(%rcx), %xmm2
320 palignr $8, %xmm3, %xmm2
322 movaps 24(%rcx), %xmm2
333 palignr $8, %xmm1, %xmm2
335 movaps 24(%rcx), %xmm2
345 palignr $8, %xmm3, %xmm2
356 movaps -8(%rcx), %xmm1
360 movaps 8(%rcx), %xmm2
361 movaps 24(%rcx), %xmm3
363 movaps 40(%rcx), %xmm4
365 movaps 56(%rcx), %xmm5
372 palignr $8, %xmm4, %xmm5
374 palignr $8, %xmm3, %xmm4
377 palignr $8, %xmm2, %xmm3
379 palignr $8, %xmm1, %xmm2
381 movaps %xmm5, 48(%rdx)
382 movaps %xmm4, 32(%rdx)
383 movaps %xmm3, 16(%rdx)
392 jmp L(CopyFrom1To16Bytes)
396 movaps -12(%rcx), %xmm1
397 movaps 4(%rcx), %xmm2
406 palignr $12, %xmm1, %xmm2
408 movaps 20(%rcx), %xmm2
419 palignr $12, %xmm3, %xmm2
421 movaps 20(%rcx), %xmm2
432 palignr $12, %xmm1, %xmm2
434 movaps 20(%rcx), %xmm2
444 palignr $12, %xmm3, %xmm2
455 movaps -12(%rcx), %xmm1
459 movaps 4(%rcx), %xmm2
460 movaps 20(%rcx), %xmm3
462 movaps 36(%rcx), %xmm4
464 movaps 52(%rcx), %xmm5
471 palignr $12, %xmm4, %xmm5
473 palignr $12, %xmm3, %xmm4
475 palignr $12, %xmm2, %xmm3
477 palignr $12, %xmm1, %xmm2
479 movaps %xmm5, 48(%rdx)
480 movaps %xmm4, 32(%rdx)
481 movaps %xmm3, 16(%rdx)
484 jmp L(Shl12LoopStart)
490 jmp L(CopyFrom1To16Bytes)
493 L(CopyFrom1To16Bytes):