Copyright (C) 2010 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.

The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA.
24 #include "asm-syntax.h"
/* Name under which this SSSE3 memcmp implementation is exported
   (selected at runtime by the glibc multiarch dispatcher).  */
27 # define MEMCMP __memcmp_ssse3
/* Annotate a 4-byte push of REG for the unwinder: the CFA offset
   grows by 4 and REG is recorded as saved at the new top of stack.  */
30 #define CFI_PUSH(REG) \
31 cfi_adjust_cfa_offset (4); \
32 cfi_rel_offset (REG, 0)
/* Companion of CFI_PUSH for popl.  NOTE(review): the final
   cfi_restore (REG) continuation line appears to be missing from this
   view of the file -- the line below still ends in a backslash with no
   continuation in sight.  Confirm against the full source.  */
34 #define CFI_POP(REG) \
35 cfi_adjust_cfa_offset (-4); \
38 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
39 #define POP(REG) popl REG; CFI_POP (REG)
/* Shared epilogue: restore the callee-saved %edi/%esi/%ebx pushed in
   the (not visible here) prologue, then return.  RETURN additionally
   restores and re-remembers the saved CFI state so code emitted after
   a RETURN keeps correct unwind information.  */
45 #define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
46 #define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
/* Code lives in a dedicated allocatable+executable SSSE3 text
   section so the dispatcher can group ISA-specific variants.  */
48 .section .text.ssse3,"ax",@progbits
108 jae L(next_unaligned_table)
126 L(next_unaligned_table):
150 pcmpeqb (%edi), %xmm1
151 movaps 16(%esi), %xmm2
152 pcmpeqb 16(%edi), %xmm2
160 lea (%ecx, %edi,1), %eax
161 lea (%ecx, %esi,1), %edx
173 pcmpeqb (%edi), %xmm0
175 movdqa 16(%esi), %xmm2
176 pcmpeqb 16(%edi), %xmm2
177 L(shr_0_gobble_loop):
182 movdqa 32(%esi), %xmm0
183 movdqa 48(%esi), %xmm2
185 pcmpeqb 32(%edi), %xmm0
186 pcmpeqb 48(%edi), %xmm2
189 jz L(shr_0_gobble_loop)
193 jge L(shr_0_gobble_loop_next)
196 L(shr_0_gobble_loop_next):
206 lea (%ecx, %edi,1), %eax
207 lea (%ecx, %esi,1), %edx
221 movdqa 16(%esi), %xmm1
223 palignr $1,(%esi), %xmm1
224 pcmpeqb (%edi), %xmm1
226 movdqa 32(%esi), %xmm3
227 palignr $1,%xmm2, %xmm3
228 pcmpeqb 16(%edi), %xmm3
236 lea (%ecx, %edi,1), %eax
237 lea 1(%ecx, %esi,1), %edx
247 movdqa 16(%esi), %xmm0
248 palignr $1,(%esi), %xmm0
249 pcmpeqb (%edi), %xmm0
251 movdqa 32(%esi), %xmm3
252 palignr $1,16(%esi), %xmm3
253 pcmpeqb 16(%edi), %xmm3
255 L(shr_1_gobble_loop):
261 movdqa 64(%esi), %xmm3
262 palignr $1,48(%esi), %xmm3
264 movdqa 48(%esi), %xmm0
265 palignr $1,32(%esi), %xmm0
266 pcmpeqb 32(%edi), %xmm0
268 pcmpeqb 48(%edi), %xmm3
271 jz L(shr_1_gobble_loop)
275 jge L(shr_1_gobble_next)
278 L(shr_1_gobble_next):
289 lea (%ecx, %edi,1), %eax
290 lea 1(%ecx, %esi,1), %edx
305 movdqa 16(%esi), %xmm1
307 palignr $2,(%esi), %xmm1
308 pcmpeqb (%edi), %xmm1
310 movdqa 32(%esi), %xmm3
311 palignr $2,%xmm2, %xmm3
312 pcmpeqb 16(%edi), %xmm3
320 lea (%ecx, %edi,1), %eax
321 lea 2(%ecx, %esi,1), %edx
331 movdqa 16(%esi), %xmm0
332 palignr $2,(%esi), %xmm0
333 pcmpeqb (%edi), %xmm0
335 movdqa 32(%esi), %xmm3
336 palignr $2,16(%esi), %xmm3
337 pcmpeqb 16(%edi), %xmm3
339 L(shr_2_gobble_loop):
345 movdqa 64(%esi), %xmm3
346 palignr $2,48(%esi), %xmm3
348 movdqa 48(%esi), %xmm0
349 palignr $2,32(%esi), %xmm0
350 pcmpeqb 32(%edi), %xmm0
352 pcmpeqb 48(%edi), %xmm3
355 jz L(shr_2_gobble_loop)
359 jge L(shr_2_gobble_next)
362 L(shr_2_gobble_next):
373 lea (%ecx, %edi,1), %eax
374 lea 2(%ecx, %esi,1), %edx
388 movdqa 16(%esi), %xmm1
390 palignr $3,(%esi), %xmm1
391 pcmpeqb (%edi), %xmm1
393 movdqa 32(%esi), %xmm3
394 palignr $3,%xmm2, %xmm3
395 pcmpeqb 16(%edi), %xmm3
403 lea (%ecx, %edi,1), %eax
404 lea 3(%ecx, %esi,1), %edx
414 movdqa 16(%esi), %xmm0
415 palignr $3,(%esi), %xmm0
416 pcmpeqb (%edi), %xmm0
418 movdqa 32(%esi), %xmm3
419 palignr $3,16(%esi), %xmm3
420 pcmpeqb 16(%edi), %xmm3
422 L(shr_3_gobble_loop):
428 movdqa 64(%esi), %xmm3
429 palignr $3,48(%esi), %xmm3
431 movdqa 48(%esi), %xmm0
432 palignr $3,32(%esi), %xmm0
433 pcmpeqb 32(%edi), %xmm0
435 pcmpeqb 48(%edi), %xmm3
438 jz L(shr_3_gobble_loop)
442 jge L(shr_3_gobble_next)
445 L(shr_3_gobble_next):
456 lea (%ecx, %edi,1), %eax
457 lea 3(%ecx, %esi,1), %edx
471 movdqa 16(%esi), %xmm1
473 palignr $4,(%esi), %xmm1
474 pcmpeqb (%edi), %xmm1
476 movdqa 32(%esi), %xmm3
477 palignr $4,%xmm2, %xmm3
478 pcmpeqb 16(%edi), %xmm3
486 lea (%ecx, %edi,1), %eax
487 lea 4(%ecx, %esi,1), %edx
497 movdqa 16(%esi), %xmm0
498 palignr $4,(%esi), %xmm0
499 pcmpeqb (%edi), %xmm0
501 movdqa 32(%esi), %xmm3
502 palignr $4,16(%esi), %xmm3
503 pcmpeqb 16(%edi), %xmm3
505 L(shr_4_gobble_loop):
511 movdqa 64(%esi), %xmm3
512 palignr $4,48(%esi), %xmm3
514 movdqa 48(%esi), %xmm0
515 palignr $4,32(%esi), %xmm0
516 pcmpeqb 32(%edi), %xmm0
518 pcmpeqb 48(%edi), %xmm3
521 jz L(shr_4_gobble_loop)
525 jge L(shr_4_gobble_next)
528 L(shr_4_gobble_next):
539 lea (%ecx, %edi,1), %eax
540 lea 4(%ecx, %esi,1), %edx
554 movdqa 16(%esi), %xmm1
556 palignr $5,(%esi), %xmm1
557 pcmpeqb (%edi), %xmm1
559 movdqa 32(%esi), %xmm3
560 palignr $5,%xmm2, %xmm3
561 pcmpeqb 16(%edi), %xmm3
569 lea (%ecx, %edi,1), %eax
570 lea 5(%ecx, %esi,1), %edx
580 movdqa 16(%esi), %xmm0
581 palignr $5,(%esi), %xmm0
582 pcmpeqb (%edi), %xmm0
584 movdqa 32(%esi), %xmm3
585 palignr $5,16(%esi), %xmm3
586 pcmpeqb 16(%edi), %xmm3
588 L(shr_5_gobble_loop):
594 movdqa 64(%esi), %xmm3
595 palignr $5,48(%esi), %xmm3
597 movdqa 48(%esi), %xmm0
598 palignr $5,32(%esi), %xmm0
599 pcmpeqb 32(%edi), %xmm0
601 pcmpeqb 48(%edi), %xmm3
604 jz L(shr_5_gobble_loop)
608 jge L(shr_5_gobble_next)
611 L(shr_5_gobble_next):
622 lea (%ecx, %edi,1), %eax
623 lea 5(%ecx, %esi,1), %edx
637 movdqa 16(%esi), %xmm1
639 palignr $6,(%esi), %xmm1
640 pcmpeqb (%edi), %xmm1
642 movdqa 32(%esi), %xmm3
643 palignr $6,%xmm2, %xmm3
644 pcmpeqb 16(%edi), %xmm3
652 lea (%ecx, %edi,1), %eax
653 lea 6(%ecx, %esi,1), %edx
663 movdqa 16(%esi), %xmm0
664 palignr $6,(%esi), %xmm0
665 pcmpeqb (%edi), %xmm0
667 movdqa 32(%esi), %xmm3
668 palignr $6,16(%esi), %xmm3
669 pcmpeqb 16(%edi), %xmm3
671 L(shr_6_gobble_loop):
677 movdqa 64(%esi), %xmm3
678 palignr $6,48(%esi), %xmm3
680 movdqa 48(%esi), %xmm0
681 palignr $6,32(%esi), %xmm0
682 pcmpeqb 32(%edi), %xmm0
684 pcmpeqb 48(%edi), %xmm3
687 jz L(shr_6_gobble_loop)
691 jge L(shr_6_gobble_next)
694 L(shr_6_gobble_next):
705 lea (%ecx, %edi,1), %eax
706 lea 6(%ecx, %esi,1), %edx
720 movdqa 16(%esi), %xmm1
722 palignr $7,(%esi), %xmm1
723 pcmpeqb (%edi), %xmm1
725 movdqa 32(%esi), %xmm3
726 palignr $7,%xmm2, %xmm3
727 pcmpeqb 16(%edi), %xmm3
735 lea (%ecx, %edi,1), %eax
736 lea 7(%ecx, %esi,1), %edx
746 movdqa 16(%esi), %xmm0
747 palignr $7,(%esi), %xmm0
748 pcmpeqb (%edi), %xmm0
750 movdqa 32(%esi), %xmm3
751 palignr $7,16(%esi), %xmm3
752 pcmpeqb 16(%edi), %xmm3
754 L(shr_7_gobble_loop):
760 movdqa 64(%esi), %xmm3
761 palignr $7,48(%esi), %xmm3
763 movdqa 48(%esi), %xmm0
764 palignr $7,32(%esi), %xmm0
765 pcmpeqb 32(%edi), %xmm0
767 pcmpeqb 48(%edi), %xmm3
770 jz L(shr_7_gobble_loop)
774 jge L(shr_7_gobble_next)
777 L(shr_7_gobble_next):
788 lea (%ecx, %edi,1), %eax
789 lea 7(%ecx, %esi,1), %edx
803 movdqa 16(%esi), %xmm1
805 palignr $8,(%esi), %xmm1
806 pcmpeqb (%edi), %xmm1
808 movdqa 32(%esi), %xmm3
809 palignr $8,%xmm2, %xmm3
810 pcmpeqb 16(%edi), %xmm3
818 lea (%ecx, %edi,1), %eax
819 lea 8(%ecx, %esi,1), %edx
829 movdqa 16(%esi), %xmm0
830 palignr $8,(%esi), %xmm0
831 pcmpeqb (%edi), %xmm0
833 movdqa 32(%esi), %xmm3
834 palignr $8,16(%esi), %xmm3
835 pcmpeqb 16(%edi), %xmm3
837 L(shr_8_gobble_loop):
843 movdqa 64(%esi), %xmm3
844 palignr $8,48(%esi), %xmm3
846 movdqa 48(%esi), %xmm0
847 palignr $8,32(%esi), %xmm0
848 pcmpeqb 32(%edi), %xmm0
850 pcmpeqb 48(%edi), %xmm3
853 jz L(shr_8_gobble_loop)
857 jge L(shr_8_gobble_next)
860 L(shr_8_gobble_next):
871 lea (%ecx, %edi,1), %eax
872 lea 8(%ecx, %esi,1), %edx
886 movdqa 16(%esi), %xmm1
888 palignr $9,(%esi), %xmm1
889 pcmpeqb (%edi), %xmm1
891 movdqa 32(%esi), %xmm3
892 palignr $9,%xmm2, %xmm3
893 pcmpeqb 16(%edi), %xmm3
901 lea (%ecx, %edi,1), %eax
902 lea 9(%ecx, %esi,1), %edx
912 movdqa 16(%esi), %xmm0
913 palignr $9,(%esi), %xmm0
914 pcmpeqb (%edi), %xmm0
916 movdqa 32(%esi), %xmm3
917 palignr $9,16(%esi), %xmm3
918 pcmpeqb 16(%edi), %xmm3
920 L(shr_9_gobble_loop):
926 movdqa 64(%esi), %xmm3
927 palignr $9,48(%esi), %xmm3
929 movdqa 48(%esi), %xmm0
930 palignr $9,32(%esi), %xmm0
931 pcmpeqb 32(%edi), %xmm0
933 pcmpeqb 48(%edi), %xmm3
936 jz L(shr_9_gobble_loop)
940 jge L(shr_9_gobble_next)
943 L(shr_9_gobble_next):
954 lea (%ecx, %edi,1), %eax
955 lea 9(%ecx, %esi,1), %edx
969 movdqa 16(%esi), %xmm1
971 palignr $10, (%esi), %xmm1
972 pcmpeqb (%edi), %xmm1
974 movdqa 32(%esi), %xmm3
975 palignr $10,%xmm2, %xmm3
976 pcmpeqb 16(%edi), %xmm3
984 lea (%ecx, %edi,1), %eax
985 lea 10(%ecx, %esi,1), %edx
995 movdqa 16(%esi), %xmm0
996 palignr $10, (%esi), %xmm0
997 pcmpeqb (%edi), %xmm0
999 movdqa 32(%esi), %xmm3
1000 palignr $10, 16(%esi), %xmm3
1001 pcmpeqb 16(%edi), %xmm3
1003 L(shr_10_gobble_loop):
1006 pmovmskb %xmm3, %edx
1009 movdqa 64(%esi), %xmm3
1010 palignr $10,48(%esi), %xmm3
1012 movdqa 48(%esi), %xmm0
1013 palignr $10,32(%esi), %xmm0
1014 pcmpeqb 32(%edi), %xmm0
1016 pcmpeqb 48(%edi), %xmm3
1019 jz L(shr_10_gobble_loop)
1023 jge L(shr_10_gobble_next)
1026 L(shr_10_gobble_next):
1030 pmovmskb %xmm3, %edx
1037 lea (%ecx, %edi,1), %eax
1038 lea 10(%ecx, %esi,1), %edx
1050 jae L(shr_11_gobble)
1052 movdqa 16(%esi), %xmm1
1054 palignr $11, (%esi), %xmm1
1055 pcmpeqb (%edi), %xmm1
1057 movdqa 32(%esi), %xmm3
1058 palignr $11, %xmm2, %xmm3
1059 pcmpeqb 16(%edi), %xmm3
1062 pmovmskb %xmm3, %edx
1067 lea (%ecx, %edi,1), %eax
1068 lea 11(%ecx, %esi,1), %edx
1078 movdqa 16(%esi), %xmm0
1079 palignr $11, (%esi), %xmm0
1080 pcmpeqb (%edi), %xmm0
1082 movdqa 32(%esi), %xmm3
1083 palignr $11, 16(%esi), %xmm3
1084 pcmpeqb 16(%edi), %xmm3
1086 L(shr_11_gobble_loop):
1089 pmovmskb %xmm3, %edx
1092 movdqa 64(%esi), %xmm3
1093 palignr $11,48(%esi), %xmm3
1095 movdqa 48(%esi), %xmm0
1096 palignr $11,32(%esi), %xmm0
1097 pcmpeqb 32(%edi), %xmm0
1099 pcmpeqb 48(%edi), %xmm3
1102 jz L(shr_11_gobble_loop)
1106 jge L(shr_11_gobble_next)
1109 L(shr_11_gobble_next):
1113 pmovmskb %xmm3, %edx
1120 lea (%ecx, %edi,1), %eax
1121 lea 11(%ecx, %esi,1), %edx
1133 jae L(shr_12_gobble)
1135 movdqa 16(%esi), %xmm1
1137 palignr $12, (%esi), %xmm1
1138 pcmpeqb (%edi), %xmm1
1140 movdqa 32(%esi), %xmm3
1141 palignr $12, %xmm2, %xmm3
1142 pcmpeqb 16(%edi), %xmm3
1145 pmovmskb %xmm3, %edx
1150 lea (%ecx, %edi,1), %eax
1151 lea 12(%ecx, %esi,1), %edx
1161 movdqa 16(%esi), %xmm0
1162 palignr $12, (%esi), %xmm0
1163 pcmpeqb (%edi), %xmm0
1165 movdqa 32(%esi), %xmm3
1166 palignr $12, 16(%esi), %xmm3
1167 pcmpeqb 16(%edi), %xmm3
1169 L(shr_12_gobble_loop):
1172 pmovmskb %xmm3, %edx
1175 movdqa 64(%esi), %xmm3
1176 palignr $12,48(%esi), %xmm3
1178 movdqa 48(%esi), %xmm0
1179 palignr $12,32(%esi), %xmm0
1180 pcmpeqb 32(%edi), %xmm0
1182 pcmpeqb 48(%edi), %xmm3
1185 jz L(shr_12_gobble_loop)
1189 jge L(shr_12_gobble_next)
1192 L(shr_12_gobble_next):
1196 pmovmskb %xmm3, %edx
1203 lea (%ecx, %edi,1), %eax
1204 lea 12(%ecx, %esi,1), %edx
1216 jae L(shr_13_gobble)
1218 movdqa 16(%esi), %xmm1
1220 palignr $13, (%esi), %xmm1
1221 pcmpeqb (%edi), %xmm1
1223 movdqa 32(%esi), %xmm3
1224 palignr $13, %xmm2, %xmm3
1225 pcmpeqb 16(%edi), %xmm3
1228 pmovmskb %xmm3, %edx
1233 lea (%ecx, %edi,1), %eax
1234 lea 13(%ecx, %esi,1), %edx
1244 movdqa 16(%esi), %xmm0
1245 palignr $13, (%esi), %xmm0
1246 pcmpeqb (%edi), %xmm0
1248 movdqa 32(%esi), %xmm3
1249 palignr $13, 16(%esi), %xmm3
1250 pcmpeqb 16(%edi), %xmm3
1252 L(shr_13_gobble_loop):
1255 pmovmskb %xmm3, %edx
1258 movdqa 64(%esi), %xmm3
1259 palignr $13,48(%esi), %xmm3
1261 movdqa 48(%esi), %xmm0
1262 palignr $13,32(%esi), %xmm0
1263 pcmpeqb 32(%edi), %xmm0
1265 pcmpeqb 48(%edi), %xmm3
1268 jz L(shr_13_gobble_loop)
1272 jge L(shr_13_gobble_next)
1275 L(shr_13_gobble_next):
1279 pmovmskb %xmm3, %edx
1286 lea (%ecx, %edi,1), %eax
1287 lea 13(%ecx, %esi,1), %edx
1299 jae L(shr_14_gobble)
1301 movdqa 16(%esi), %xmm1
1303 palignr $14, (%esi), %xmm1
1304 pcmpeqb (%edi), %xmm1
1306 movdqa 32(%esi), %xmm3
1307 palignr $14, %xmm2, %xmm3
1308 pcmpeqb 16(%edi), %xmm3
1311 pmovmskb %xmm3, %edx
1316 lea (%ecx, %edi,1), %eax
1317 lea 14(%ecx, %esi,1), %edx
1327 movdqa 16(%esi), %xmm0
1328 palignr $14, (%esi), %xmm0
1329 pcmpeqb (%edi), %xmm0
1331 movdqa 32(%esi), %xmm3
1332 palignr $14, 16(%esi), %xmm3
1333 pcmpeqb 16(%edi), %xmm3
1335 L(shr_14_gobble_loop):
1338 pmovmskb %xmm3, %edx
1341 movdqa 64(%esi), %xmm3
1342 palignr $14,48(%esi), %xmm3
1344 movdqa 48(%esi), %xmm0
1345 palignr $14,32(%esi), %xmm0
1346 pcmpeqb 32(%edi), %xmm0
1348 pcmpeqb 48(%edi), %xmm3
1351 jz L(shr_14_gobble_loop)
1355 jge L(shr_14_gobble_next)
1358 L(shr_14_gobble_next):
1362 pmovmskb %xmm3, %edx
1369 lea (%ecx, %edi,1), %eax
1370 lea 14(%ecx, %esi,1), %edx
1382 jae L(shr_15_gobble)
1384 movdqa 16(%esi), %xmm1
1386 palignr $15, (%esi), %xmm1
1387 pcmpeqb (%edi), %xmm1
1389 movdqa 32(%esi), %xmm3
1390 palignr $15, %xmm2, %xmm3
1391 pcmpeqb 16(%edi), %xmm3
1394 pmovmskb %xmm3, %edx
1399 lea (%ecx, %edi,1), %eax
1400 lea 15(%ecx, %esi,1), %edx
1410 movdqa 16(%esi), %xmm0
1411 palignr $15, (%esi), %xmm0
1412 pcmpeqb (%edi), %xmm0
1414 movdqa 32(%esi), %xmm3
1415 palignr $15, 16(%esi), %xmm3
1416 pcmpeqb 16(%edi), %xmm3
1418 L(shr_15_gobble_loop):
1421 pmovmskb %xmm3, %edx
1424 movdqa 64(%esi), %xmm3
1425 palignr $15,48(%esi), %xmm3
1427 movdqa 48(%esi), %xmm0
1428 palignr $15,32(%esi), %xmm0
1429 pcmpeqb 32(%edi), %xmm0
1431 pcmpeqb 48(%edi), %xmm3
1434 jz L(shr_15_gobble_loop)
1438 jge L(shr_15_gobble_next)
1441 L(shr_15_gobble_next):
1445 pmovmskb %xmm3, %edx
1452 lea (%ecx, %edi,1), %eax
1453 lea 15(%ecx, %esi,1), %edx
1462 pmovmskb %xmm1, %ebx
1495 movzbl -9(%edi), %eax
1496 movzbl -9(%esi), %edx
1502 movzbl -16(%edi), %eax
1503 movzbl -16(%esi), %edx
1509 movzbl -15(%edi), %eax
1510 movzbl -15(%esi), %edx
1516 movzbl -14(%edi), %eax
1517 movzbl -14(%esi), %edx
1523 movzbl -13(%edi), %eax
1524 movzbl -13(%esi), %edx
1530 movzbl -12(%edi), %eax
1531 movzbl -12(%esi), %edx
1537 movzbl -11(%edi), %eax
1538 movzbl -11(%esi), %edx
1544 movzbl -10(%edi), %eax
1545 movzbl -10(%esi), %edx
1576 movzbl -9(%edi), %eax
1577 movzbl -9(%esi), %edx
1813 movzbl -1(%eax), %ecx
1878 movzwl -2(%eax), %ecx
1879 movzwl -2(%edx), %ebx
1891 movl -47(%eax), %ecx
1892 movl -47(%edx), %ebx
1896 movl -43(%eax), %ecx
1897 movl -43(%edx), %ebx
1901 movl -39(%eax), %ecx
1902 movl -39(%edx), %ebx
1906 movl -35(%eax), %ecx
1907 movl -35(%edx), %ebx
1911 movl -31(%eax), %ecx
1912 movl -31(%edx), %ebx
1916 movl -27(%eax), %ecx
1917 movl -27(%edx), %ebx
1921 movl -23(%eax), %ecx
1922 movl -23(%edx), %ebx
1926 movl -19(%eax), %ecx
1927 movl -19(%edx), %ebx
1931 movl -15(%eax), %ecx
1932 movl -15(%edx), %ebx
1936 movl -11(%eax), %ecx
1937 movl -11(%edx), %ebx
1946 movzwl -3(%eax), %ecx
1947 movzwl -3(%edx), %ebx
1952 movzbl -1(%eax), %eax