2 Copyright (C) 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
/* Unwind bookkeeping meant to follow a 4-byte push of REG: adjust the
   CFA offset by 4, then describe REG with cfi_rel_offset at offset 0.
   The cfi_* helpers are defined outside this view (presumably thin
   wrappers around the assembler's .cfi_* directives -- confirm).  */
23 # define CFI_PUSH(REG) \
24 cfi_adjust_cfa_offset (4); \
25 cfi_rel_offset (REG, 0)
/* Unwind bookkeeping meant to follow a 4-byte pop of REG: adjust the
   CFA offset by -4 and issue cfi_restore for REG, the counterpart of
   the cfi_rel_offset that CFI_PUSH issues.  The cfi_restore line also
   terminates the macro body; previously the trailing backslash had no
   continuation of its own and spliced the next "# define" line into
   this macro.  The cfi_* helpers are defined outside this view
   (presumably wrappers around the assembler's .cfi_* directives --
   confirm).  */
27 # define CFI_POP(REG) \
28 cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
31 # define PUSH(REG) pushl REG; CFI_PUSH (REG) /* pushl REG, then the matching CFI_PUSH bookkeeping.  */
/* popl REG, immediately followed by the matching CFI_POP bookkeeping so
   the unwind annotations change right after the instruction does.  */
32 # define POP(REG) popl REG; CFI_POP (REG)
/* Return sequence: pop %edi (with its CFI bookkeeping), execute ret,
   then emit CFI_PUSH (%edi) again.  The trailing CFI_PUSH produces no
   instruction; presumably it re-establishes the "%edi is on the stack"
   unwind state for the code assembled after this return point --
   NOTE(review): confirm against the full function body.  */
35 # define RETURN POP (%edi); ret; CFI_PUSH (%edi)
41 ENTRY (__wcscpy_ssse3)
70 jnz L(CopyFrom1To16Bytes)
91 movaps 16(%ecx), %xmm2
98 jnz L(CopyFrom1To16Bytes)
100 movaps 16(%ecx, %esi), %xmm3
101 movaps %xmm2, (%edx, %esi)
107 jnz L(CopyFrom1To16Bytes)
109 movaps 16(%ecx, %esi), %xmm4
110 movaps %xmm3, (%edx, %esi)
116 jnz L(CopyFrom1To16Bytes)
118 movaps 16(%ecx, %esi), %xmm1
119 movaps %xmm4, (%edx, %esi)
125 jnz L(CopyFrom1To16Bytes)
127 movaps 16(%ecx, %esi), %xmm2
128 movaps %xmm1, (%edx, %esi)
134 jnz L(CopyFrom1To16Bytes)
136 movaps 16(%ecx, %esi), %xmm3
137 movaps %xmm2, (%edx, %esi)
143 jnz L(CopyFrom1To16Bytes)
145 movaps %xmm3, (%edx, %esi)
147 lea 16(%ecx, %esi), %ecx
156 movaps 32(%ecx), %xmm3
158 movaps 16(%ecx), %xmm5
160 movaps 48(%ecx), %xmm7
170 jnz L(Aligned64Leave)
171 movaps %xmm4, -64(%edx)
172 movaps %xmm5, -48(%edx)
173 movaps %xmm6, -32(%edx)
174 movaps %xmm7, -16(%edx)
181 jnz L(CopyFrom1To16Bytes)
185 movaps %xmm4, -64(%edx)
188 jnz L(CopyFrom1To16Bytes)
192 movaps %xmm5, -48(%edx)
195 jnz L(CopyFrom1To16Bytes)
197 movaps %xmm6, -32(%edx)
202 jnz L(CopyFrom1To16Bytes)
205 movaps %xmm7, -16(%edx)
210 movaps -4(%ecx), %xmm1
211 movaps 12(%ecx), %xmm2
220 palignr $4, %xmm1, %xmm2
222 movaps 28(%ecx), %xmm2
233 palignr $4, %xmm3, %xmm2
235 movaps 28(%ecx), %xmm2
246 palignr $4, %xmm1, %xmm2
248 movaps 28(%ecx), %xmm2
258 palignr $4, %xmm3, %xmm2
269 movaps -4(%ecx), %xmm1
272 movaps 12(%ecx), %xmm2
273 movaps 28(%ecx), %xmm3
275 movaps 44(%ecx), %xmm4
277 movaps 60(%ecx), %xmm5
284 palignr $4, %xmm4, %xmm5
286 palignr $4, %xmm3, %xmm4
289 palignr $4, %xmm2, %xmm3
291 palignr $4, %xmm1, %xmm2
293 movaps %xmm5, 48(%edx)
294 movaps %xmm4, 32(%edx)
295 movaps %xmm3, 16(%edx)
321 movaps -8(%ecx), %xmm1
322 movaps 8(%ecx), %xmm2
331 palignr $8, %xmm1, %xmm2
333 movaps 24(%ecx), %xmm2
344 palignr $8, %xmm3, %xmm2
346 movaps 24(%ecx), %xmm2
357 palignr $8, %xmm1, %xmm2
359 movaps 24(%ecx), %xmm2
369 palignr $8, %xmm3, %xmm2
380 movaps -8(%ecx), %xmm1
383 movaps 8(%ecx), %xmm2
384 movaps 24(%ecx), %xmm3
386 movaps 40(%ecx), %xmm4
388 movaps 56(%ecx), %xmm5
395 palignr $8, %xmm4, %xmm5
397 palignr $8, %xmm3, %xmm4
400 palignr $8, %xmm2, %xmm3
402 palignr $8, %xmm1, %xmm2
404 movaps %xmm5, 48(%edx)
405 movaps %xmm4, 32(%edx)
406 movaps %xmm3, 16(%edx)
430 movaps -12(%ecx), %xmm1
431 movaps 4(%ecx), %xmm2
440 palignr $12, %xmm1, %xmm2
442 movaps 20(%ecx), %xmm2
453 palignr $12, %xmm3, %xmm2
455 movaps 20(%ecx), %xmm2
466 palignr $12, %xmm1, %xmm2
468 movaps 20(%ecx), %xmm2
478 palignr $12, %xmm3, %xmm2
489 movaps -12(%ecx), %xmm1
492 movaps 4(%ecx), %xmm2
493 movaps 20(%ecx), %xmm3
495 movaps 36(%ecx), %xmm4
497 movaps 52(%ecx), %xmm5
504 palignr $12, %xmm4, %xmm5
506 palignr $12, %xmm3, %xmm4
509 palignr $12, %xmm2, %xmm3
511 palignr $12, %xmm1, %xmm2
513 movaps %xmm5, 48(%edx)
514 movaps %xmm4, 32(%edx)
515 movaps %xmm3, 16(%edx)
518 jmp L(Shl12LoopStart)
526 L(CopyFrom1To16Bytes):