2 Copyright (C) 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
/* NOTE(review): this excerpt is non-contiguous (the embedded original
   line numbers jump), so some backslash-continued macro lines below
   are missing -- confirm against the full source before assembling.  */
24 #include "asm-syntax.h"
/* Public name of the SSSE3 memcmp implementation provided here.  */
27 # define MEMCMP __memcmp_ssse3
/* Record a 4-byte push in the call-frame information so stack
   unwinding stays correct through PUSH below.  */
30 #define CFI_PUSH(REG) \
31 cfi_adjust_cfa_offset (4); \
32 cfi_rel_offset (REG, 0)
/* Undo the CFI effect of a 4-byte pop.  NOTE(review): the usual
   cfi_restore (REG) continuation line is not visible in this excerpt.  */
34 #define CFI_POP(REG) \
35 cfi_adjust_cfa_offset (-4); \
38 #define PUSH(REG) pushl REG; CFI_PUSH (REG)
39 #define POP(REG) popl REG; CFI_POP (REG)
/* Shared epilogue: restore the callee-saved %edi/%esi/%ebx and return.
   RETURN additionally re-registers the pushes for fall-through paths;
   its final continuation line is not visible in this excerpt.  */
45 #define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
46 #define RETURN RETURN_END; CFI_PUSH (%ebx); CFI_PUSH (%edi); \
49 .section .text.ssse3,"ax",@progbits
/* Dispatch for inputs whose relative alignment needs the palignr
   (shr_1..shr_15) handlers.  NOTE(review): the compare that sets the
   flags for this jump is not visible in this excerpt.  */
107 jae L(next_unaligned_table)
125 L(next_unaligned_table):
/* pcmpeqb compares 16 bytes at once: equal byte lanes become 0xFF,
   differing lanes become 0x00 (mask later extracted via pmovmskb).  */
149 pcmpeqb (%edi), %xmm1
150 movaps 16(%esi), %xmm2
151 pcmpeqb 16(%edi), %xmm2
/* Advance both buffer pointers by the processed count %ecx without
   disturbing the flags (lea does not write EFLAGS).  */
160 lea (%ecx, %edi,1), %eax
161 lea (%ecx, %esi,1), %edx
/* shr_0 path: both inputs share the same 16-byte alignment, so the
   second buffer can be compared directly -- no palignr realignment
   (contrast with the shr_1..shr_15 variants below).  */
171 pcmpeqb (%edi), %xmm0
173 movdqa 16(%esi), %xmm2
174 pcmpeqb 16(%edi), %xmm2
/* Unrolled main loop: 64 bytes compared per iteration.  */
175 L(shr_0_gobble_loop):
180 movdqa 32(%esi), %xmm0
181 movdqa 48(%esi), %xmm2
183 pcmpeqb 32(%edi), %xmm0
184 pcmpeqb 48(%edi), %xmm2
/* Loop while every compared byte matched.  NOTE(review): the
   pmovmskb/test feeding these conditions is not visible here.  */
187 jz L(shr_0_gobble_loop)
191 jge L(shr_0_gobble_loop_next)
194 L(shr_0_gobble_loop_next):
/* Form the addresses of the first unprocessed byte of each buffer
   for the tail comparison.  */
204 lea (%ecx, %edi,1), %eax
205 lea (%ecx, %esi,1), %edx
/* shr_1 variant: the second source is displaced 1 byte relative to a
   16-byte boundary, so each 16-byte chunk is reassembled with palignr
   before comparing.  The shr_2..shr_15 handlers repeat this pattern
   with the immediate changed.  NOTE(review): the alignment setup is
   outside this excerpt -- confirm against the full source.  */
217 movdqa 16(%esi), %xmm1
/* Concatenate {xmm1:(%esi)} and shift right by 1 byte, realigning
   src2 into xmm1 for a direct 16-byte compare against src1.  */
219 palignr $1,(%esi), %xmm1
220 pcmpeqb (%edi), %xmm1
222 movdqa 32(%esi), %xmm3
223 palignr $1,%xmm2, %xmm3
224 pcmpeqb 16(%edi), %xmm3
/* Tail setup: %eax/%edx address the first unprocessed byte of each
   buffer; src2 keeps its 1-byte displacement.  */
232 lea (%ecx, %edi,1), %eax
233 lea 1(%ecx, %esi,1), %edx
/* shr_1_gobble: the same realign-and-compare scheme unrolled to 64
   bytes per iteration for large lengths.  */
241 movdqa 16(%esi), %xmm0
242 palignr $1,(%esi), %xmm0
243 pcmpeqb (%edi), %xmm0
245 movdqa 32(%esi), %xmm3
246 palignr $1,16(%esi), %xmm3
247 pcmpeqb 16(%edi), %xmm3
249 L(shr_1_gobble_loop):
255 movdqa 64(%esi), %xmm3
256 palignr $1,48(%esi), %xmm3
258 movdqa 48(%esi), %xmm0
259 palignr $1,32(%esi), %xmm0
260 pcmpeqb 32(%edi), %xmm0
262 pcmpeqb 48(%edi), %xmm3
/* Continue while all 64 compared bytes were equal.  NOTE(review):
   the mask extraction setting these flags is not visible here.  */
265 jz L(shr_1_gobble_loop)
268 jge L(shr_1_gobble_next)
271 L(shr_1_gobble_next):
282 lea (%ecx, %edi,1), %eax
283 lea 1(%ecx, %esi,1), %edx
296 movdqa 16(%esi), %xmm1
298 palignr $2,(%esi), %xmm1
299 pcmpeqb (%edi), %xmm1
301 movdqa 32(%esi), %xmm3
302 palignr $2,%xmm2, %xmm3
303 pcmpeqb 16(%edi), %xmm3
311 lea (%ecx, %edi,1), %eax
312 lea 2(%ecx, %esi,1), %edx
320 movdqa 16(%esi), %xmm0
321 palignr $2,(%esi), %xmm0
322 pcmpeqb (%edi), %xmm0
324 movdqa 32(%esi), %xmm3
325 palignr $2,16(%esi), %xmm3
326 pcmpeqb 16(%edi), %xmm3
328 L(shr_2_gobble_loop):
334 movdqa 64(%esi), %xmm3
335 palignr $2,48(%esi), %xmm3
337 movdqa 48(%esi), %xmm0
338 palignr $2,32(%esi), %xmm0
339 pcmpeqb 32(%edi), %xmm0
341 pcmpeqb 48(%edi), %xmm3
344 jz L(shr_2_gobble_loop)
347 jge L(shr_2_gobble_next)
350 L(shr_2_gobble_next):
361 lea (%ecx, %edi,1), %eax
362 lea 2(%ecx, %esi,1), %edx
374 movdqa 16(%esi), %xmm1
376 palignr $3,(%esi), %xmm1
377 pcmpeqb (%edi), %xmm1
379 movdqa 32(%esi), %xmm3
380 palignr $3,%xmm2, %xmm3
381 pcmpeqb 16(%edi), %xmm3
389 lea (%ecx, %edi,1), %eax
390 lea 3(%ecx, %esi,1), %edx
398 movdqa 16(%esi), %xmm0
399 palignr $3,(%esi), %xmm0
400 pcmpeqb (%edi), %xmm0
402 movdqa 32(%esi), %xmm3
403 palignr $3,16(%esi), %xmm3
404 pcmpeqb 16(%edi), %xmm3
406 L(shr_3_gobble_loop):
412 movdqa 64(%esi), %xmm3
413 palignr $3,48(%esi), %xmm3
415 movdqa 48(%esi), %xmm0
416 palignr $3,32(%esi), %xmm0
417 pcmpeqb 32(%edi), %xmm0
419 pcmpeqb 48(%edi), %xmm3
422 jz L(shr_3_gobble_loop)
425 jge L(shr_3_gobble_next)
428 L(shr_3_gobble_next):
439 lea (%ecx, %edi,1), %eax
440 lea 3(%ecx, %esi,1), %edx
452 movdqa 16(%esi), %xmm1
454 palignr $4,(%esi), %xmm1
455 pcmpeqb (%edi), %xmm1
457 movdqa 32(%esi), %xmm3
458 palignr $4,%xmm2, %xmm3
459 pcmpeqb 16(%edi), %xmm3
467 lea (%ecx, %edi,1), %eax
468 lea 4(%ecx, %esi,1), %edx
476 movdqa 16(%esi), %xmm0
477 palignr $4,(%esi), %xmm0
478 pcmpeqb (%edi), %xmm0
480 movdqa 32(%esi), %xmm3
481 palignr $4,16(%esi), %xmm3
482 pcmpeqb 16(%edi), %xmm3
484 L(shr_4_gobble_loop):
490 movdqa 64(%esi), %xmm3
491 palignr $4,48(%esi), %xmm3
493 movdqa 48(%esi), %xmm0
494 palignr $4,32(%esi), %xmm0
495 pcmpeqb 32(%edi), %xmm0
497 pcmpeqb 48(%edi), %xmm3
500 jz L(shr_4_gobble_loop)
503 jge L(shr_4_gobble_next)
506 L(shr_4_gobble_next):
517 lea (%ecx, %edi,1), %eax
518 lea 4(%ecx, %esi,1), %edx
530 movdqa 16(%esi), %xmm1
532 palignr $5,(%esi), %xmm1
533 pcmpeqb (%edi), %xmm1
535 movdqa 32(%esi), %xmm3
536 palignr $5,%xmm2, %xmm3
537 pcmpeqb 16(%edi), %xmm3
545 lea (%ecx, %edi,1), %eax
546 lea 5(%ecx, %esi,1), %edx
554 movdqa 16(%esi), %xmm0
555 palignr $5,(%esi), %xmm0
556 pcmpeqb (%edi), %xmm0
558 movdqa 32(%esi), %xmm3
559 palignr $5,16(%esi), %xmm3
560 pcmpeqb 16(%edi), %xmm3
562 L(shr_5_gobble_loop):
568 movdqa 64(%esi), %xmm3
569 palignr $5,48(%esi), %xmm3
571 movdqa 48(%esi), %xmm0
572 palignr $5,32(%esi), %xmm0
573 pcmpeqb 32(%edi), %xmm0
575 pcmpeqb 48(%edi), %xmm3
578 jz L(shr_5_gobble_loop)
581 jge L(shr_5_gobble_next)
584 L(shr_5_gobble_next):
595 lea (%ecx, %edi,1), %eax
596 lea 5(%ecx, %esi,1), %edx
608 movdqa 16(%esi), %xmm1
610 palignr $6,(%esi), %xmm1
611 pcmpeqb (%edi), %xmm1
613 movdqa 32(%esi), %xmm3
614 palignr $6,%xmm2, %xmm3
615 pcmpeqb 16(%edi), %xmm3
623 lea (%ecx, %edi,1), %eax
624 lea 6(%ecx, %esi,1), %edx
632 movdqa 16(%esi), %xmm0
633 palignr $6,(%esi), %xmm0
634 pcmpeqb (%edi), %xmm0
636 movdqa 32(%esi), %xmm3
637 palignr $6,16(%esi), %xmm3
638 pcmpeqb 16(%edi), %xmm3
640 L(shr_6_gobble_loop):
646 movdqa 64(%esi), %xmm3
647 palignr $6,48(%esi), %xmm3
649 movdqa 48(%esi), %xmm0
650 palignr $6,32(%esi), %xmm0
651 pcmpeqb 32(%edi), %xmm0
653 pcmpeqb 48(%edi), %xmm3
656 jz L(shr_6_gobble_loop)
659 jge L(shr_6_gobble_next)
662 L(shr_6_gobble_next):
673 lea (%ecx, %edi,1), %eax
674 lea 6(%ecx, %esi,1), %edx
686 movdqa 16(%esi), %xmm1
688 palignr $7,(%esi), %xmm1
689 pcmpeqb (%edi), %xmm1
691 movdqa 32(%esi), %xmm3
692 palignr $7,%xmm2, %xmm3
693 pcmpeqb 16(%edi), %xmm3
701 lea (%ecx, %edi,1), %eax
702 lea 7(%ecx, %esi,1), %edx
710 movdqa 16(%esi), %xmm0
711 palignr $7,(%esi), %xmm0
712 pcmpeqb (%edi), %xmm0
714 movdqa 32(%esi), %xmm3
715 palignr $7,16(%esi), %xmm3
716 pcmpeqb 16(%edi), %xmm3
718 L(shr_7_gobble_loop):
724 movdqa 64(%esi), %xmm3
725 palignr $7,48(%esi), %xmm3
727 movdqa 48(%esi), %xmm0
728 palignr $7,32(%esi), %xmm0
729 pcmpeqb 32(%edi), %xmm0
731 pcmpeqb 48(%edi), %xmm3
734 jz L(shr_7_gobble_loop)
737 jge L(shr_7_gobble_next)
740 L(shr_7_gobble_next):
751 lea (%ecx, %edi,1), %eax
752 lea 7(%ecx, %esi,1), %edx
764 movdqa 16(%esi), %xmm1
766 palignr $8,(%esi), %xmm1
767 pcmpeqb (%edi), %xmm1
769 movdqa 32(%esi), %xmm3
770 palignr $8,%xmm2, %xmm3
771 pcmpeqb 16(%edi), %xmm3
779 lea (%ecx, %edi,1), %eax
780 lea 8(%ecx, %esi,1), %edx
788 movdqa 16(%esi), %xmm0
789 palignr $8,(%esi), %xmm0
790 pcmpeqb (%edi), %xmm0
792 movdqa 32(%esi), %xmm3
793 palignr $8,16(%esi), %xmm3
794 pcmpeqb 16(%edi), %xmm3
796 L(shr_8_gobble_loop):
802 movdqa 64(%esi), %xmm3
803 palignr $8,48(%esi), %xmm3
805 movdqa 48(%esi), %xmm0
806 palignr $8,32(%esi), %xmm0
807 pcmpeqb 32(%edi), %xmm0
809 pcmpeqb 48(%edi), %xmm3
812 jz L(shr_8_gobble_loop)
815 jge L(shr_8_gobble_next)
818 L(shr_8_gobble_next):
829 lea (%ecx, %edi,1), %eax
830 lea 8(%ecx, %esi,1), %edx
842 movdqa 16(%esi), %xmm1
844 palignr $9,(%esi), %xmm1
845 pcmpeqb (%edi), %xmm1
847 movdqa 32(%esi), %xmm3
848 palignr $9,%xmm2, %xmm3
849 pcmpeqb 16(%edi), %xmm3
857 lea (%ecx, %edi,1), %eax
858 lea 9(%ecx, %esi,1), %edx
866 movdqa 16(%esi), %xmm0
867 palignr $9,(%esi), %xmm0
868 pcmpeqb (%edi), %xmm0
870 movdqa 32(%esi), %xmm3
871 palignr $9,16(%esi), %xmm3
872 pcmpeqb 16(%edi), %xmm3
874 L(shr_9_gobble_loop):
880 movdqa 64(%esi), %xmm3
881 palignr $9,48(%esi), %xmm3
883 movdqa 48(%esi), %xmm0
884 palignr $9,32(%esi), %xmm0
885 pcmpeqb 32(%edi), %xmm0
887 pcmpeqb 48(%edi), %xmm3
890 jz L(shr_9_gobble_loop)
893 jge L(shr_9_gobble_next)
896 L(shr_9_gobble_next):
907 lea (%ecx, %edi,1), %eax
908 lea 9(%ecx, %esi,1), %edx
920 movdqa 16(%esi), %xmm1
922 palignr $10, (%esi), %xmm1
923 pcmpeqb (%edi), %xmm1
925 movdqa 32(%esi), %xmm3
926 palignr $10,%xmm2, %xmm3
927 pcmpeqb 16(%edi), %xmm3
935 lea (%ecx, %edi,1), %eax
936 lea 10(%ecx, %esi,1), %edx
944 movdqa 16(%esi), %xmm0
945 palignr $10, (%esi), %xmm0
946 pcmpeqb (%edi), %xmm0
948 movdqa 32(%esi), %xmm3
949 palignr $10, 16(%esi), %xmm3
950 pcmpeqb 16(%edi), %xmm3
952 L(shr_10_gobble_loop):
958 movdqa 64(%esi), %xmm3
959 palignr $10,48(%esi), %xmm3
961 movdqa 48(%esi), %xmm0
962 palignr $10,32(%esi), %xmm0
963 pcmpeqb 32(%edi), %xmm0
965 pcmpeqb 48(%edi), %xmm3
968 jz L(shr_10_gobble_loop)
971 jge L(shr_10_gobble_next)
974 L(shr_10_gobble_next):
985 lea (%ecx, %edi,1), %eax
986 lea 10(%ecx, %esi,1), %edx
998 movdqa 16(%esi), %xmm1
1000 palignr $11, (%esi), %xmm1
1001 pcmpeqb (%edi), %xmm1
1003 movdqa 32(%esi), %xmm3
1004 palignr $11, %xmm2, %xmm3
1005 pcmpeqb 16(%edi), %xmm3
1008 pmovmskb %xmm3, %edx
1013 lea (%ecx, %edi,1), %eax
1014 lea 11(%ecx, %esi,1), %edx
1022 movdqa 16(%esi), %xmm0
1023 palignr $11, (%esi), %xmm0
1024 pcmpeqb (%edi), %xmm0
1026 movdqa 32(%esi), %xmm3
1027 palignr $11, 16(%esi), %xmm3
1028 pcmpeqb 16(%edi), %xmm3
1030 L(shr_11_gobble_loop):
1033 pmovmskb %xmm3, %edx
1036 movdqa 64(%esi), %xmm3
1037 palignr $11,48(%esi), %xmm3
1039 movdqa 48(%esi), %xmm0
1040 palignr $11,32(%esi), %xmm0
1041 pcmpeqb 32(%edi), %xmm0
1043 pcmpeqb 48(%edi), %xmm3
1046 jz L(shr_11_gobble_loop)
1049 jge L(shr_11_gobble_next)
1052 L(shr_11_gobble_next):
1056 pmovmskb %xmm3, %edx
1063 lea (%ecx, %edi,1), %eax
1064 lea 11(%ecx, %esi,1), %edx
1074 jae L(shr_12_gobble)
1076 movdqa 16(%esi), %xmm1
1078 palignr $12, (%esi), %xmm1
1079 pcmpeqb (%edi), %xmm1
1081 movdqa 32(%esi), %xmm3
1082 palignr $12, %xmm2, %xmm3
1083 pcmpeqb 16(%edi), %xmm3
1086 pmovmskb %xmm3, %edx
1091 lea (%ecx, %edi,1), %eax
1092 lea 12(%ecx, %esi,1), %edx
1100 movdqa 16(%esi), %xmm0
1101 palignr $12, (%esi), %xmm0
1102 pcmpeqb (%edi), %xmm0
1104 movdqa 32(%esi), %xmm3
1105 palignr $12, 16(%esi), %xmm3
1106 pcmpeqb 16(%edi), %xmm3
1108 L(shr_12_gobble_loop):
1111 pmovmskb %xmm3, %edx
1114 movdqa 64(%esi), %xmm3
1115 palignr $12,48(%esi), %xmm3
1117 movdqa 48(%esi), %xmm0
1118 palignr $12,32(%esi), %xmm0
1119 pcmpeqb 32(%edi), %xmm0
1121 pcmpeqb 48(%edi), %xmm3
1124 jz L(shr_12_gobble_loop)
1127 jge L(shr_12_gobble_next)
1130 L(shr_12_gobble_next):
1134 pmovmskb %xmm3, %edx
1141 lea (%ecx, %edi,1), %eax
1142 lea 12(%ecx, %esi,1), %edx
1152 jae L(shr_13_gobble)
1154 movdqa 16(%esi), %xmm1
1156 palignr $13, (%esi), %xmm1
1157 pcmpeqb (%edi), %xmm1
1159 movdqa 32(%esi), %xmm3
1160 palignr $13, %xmm2, %xmm3
1161 pcmpeqb 16(%edi), %xmm3
1164 pmovmskb %xmm3, %edx
1169 lea (%ecx, %edi,1), %eax
1170 lea 13(%ecx, %esi,1), %edx
1178 movdqa 16(%esi), %xmm0
1179 palignr $13, (%esi), %xmm0
1180 pcmpeqb (%edi), %xmm0
1182 movdqa 32(%esi), %xmm3
1183 palignr $13, 16(%esi), %xmm3
1184 pcmpeqb 16(%edi), %xmm3
1186 L(shr_13_gobble_loop):
1189 pmovmskb %xmm3, %edx
1192 movdqa 64(%esi), %xmm3
1193 palignr $13,48(%esi), %xmm3
1195 movdqa 48(%esi), %xmm0
1196 palignr $13,32(%esi), %xmm0
1197 pcmpeqb 32(%edi), %xmm0
1199 pcmpeqb 48(%edi), %xmm3
1202 jz L(shr_13_gobble_loop)
1205 jge L(shr_13_gobble_next)
1208 L(shr_13_gobble_next):
1212 pmovmskb %xmm3, %edx
1219 lea (%ecx, %edi,1), %eax
1220 lea 13(%ecx, %esi,1), %edx
1230 jae L(shr_14_gobble)
1232 movdqa 16(%esi), %xmm1
1234 palignr $14, (%esi), %xmm1
1235 pcmpeqb (%edi), %xmm1
1237 movdqa 32(%esi), %xmm3
1238 palignr $14, %xmm2, %xmm3
1239 pcmpeqb 16(%edi), %xmm3
1242 pmovmskb %xmm3, %edx
1247 lea (%ecx, %edi,1), %eax
1248 lea 14(%ecx, %esi,1), %edx
1256 movdqa 16(%esi), %xmm0
1257 palignr $14, (%esi), %xmm0
1258 pcmpeqb (%edi), %xmm0
1260 movdqa 32(%esi), %xmm3
1261 palignr $14, 16(%esi), %xmm3
1262 pcmpeqb 16(%edi), %xmm3
1264 L(shr_14_gobble_loop):
1267 pmovmskb %xmm3, %edx
1270 movdqa 64(%esi), %xmm3
1271 palignr $14,48(%esi), %xmm3
1273 movdqa 48(%esi), %xmm0
1274 palignr $14,32(%esi), %xmm0
1275 pcmpeqb 32(%edi), %xmm0
1277 pcmpeqb 48(%edi), %xmm3
1280 jz L(shr_14_gobble_loop)
1283 jge L(shr_14_gobble_next)
1286 L(shr_14_gobble_next):
1290 pmovmskb %xmm3, %edx
1297 lea (%ecx, %edi,1), %eax
1298 lea 14(%ecx, %esi,1), %edx
1308 jae L(shr_15_gobble)
1310 movdqa 16(%esi), %xmm1
1312 palignr $15, (%esi), %xmm1
1313 pcmpeqb (%edi), %xmm1
1315 movdqa 32(%esi), %xmm3
1316 palignr $15, %xmm2, %xmm3
1317 pcmpeqb 16(%edi), %xmm3
1320 pmovmskb %xmm3, %edx
1325 lea (%ecx, %edi,1), %eax
1326 lea 15(%ecx, %esi,1), %edx
1334 movdqa 16(%esi), %xmm0
1335 palignr $15, (%esi), %xmm0
1336 pcmpeqb (%edi), %xmm0
1338 movdqa 32(%esi), %xmm3
1339 palignr $15, 16(%esi), %xmm3
1340 pcmpeqb 16(%edi), %xmm3
1342 L(shr_15_gobble_loop):
1345 pmovmskb %xmm3, %edx
1348 movdqa 64(%esi), %xmm3
1349 palignr $15,48(%esi), %xmm3
1351 movdqa 48(%esi), %xmm0
1352 palignr $15,32(%esi), %xmm0
1353 pcmpeqb 32(%edi), %xmm0
1355 pcmpeqb 48(%edi), %xmm3
1358 jz L(shr_15_gobble_loop)
1361 jge L(shr_15_gobble_next)
1364 L(shr_15_gobble_next):
1368 pmovmskb %xmm3, %edx
1375 lea (%ecx, %edi,1), %eax
1376 lea 15(%ecx, %esi,1), %edx
/* Extract the per-byte equality mask: each bit is 1 where the bytes
   matched (pcmpeqb produced 0xFF), 0 at a mismatch.  */
1383 pmovmskb %xmm1, %ebx
/* Byte-granular tail cases: reload the single differing byte from
   each buffer zero-extended, so a plain subtract yields memcmp's
   signed result.  NOTE(review): the subtract/return instructions
   between these load pairs are not visible in this excerpt.  */
1416 movzbl -9(%edi), %eax
1417 movzbl -9(%esi), %edx
1423 movzbl -16(%edi), %eax
1424 movzbl -16(%esi), %edx
1430 movzbl -15(%edi), %eax
1431 movzbl -15(%esi), %edx
1437 movzbl -14(%edi), %eax
1438 movzbl -14(%esi), %edx
1444 movzbl -13(%edi), %eax
1445 movzbl -13(%esi), %edx
1451 movzbl -12(%edi), %eax
1452 movzbl -12(%esi), %edx
1458 movzbl -11(%edi), %eax
1459 movzbl -11(%esi), %edx
1465 movzbl -10(%edi), %eax
1466 movzbl -10(%esi), %edx
1497 movzbl -9(%edi), %eax
1498 movzbl -9(%esi), %edx
/* Exact-length epilogues: compare the trailing bytes in 4-, 2- and
   1-byte steps; here %eax/%edx are the one-past-the-end pointers of
   the two buffers (negative displacements index back from the end).  */
1737 movzbl -1(%eax), %ecx
1802 movzwl -2(%eax), %ecx
1803 movzwl -2(%edx), %ebx
1815 movl -47(%eax), %ecx
1816 movl -47(%edx), %ebx
1820 movl -43(%eax), %ecx
1821 movl -43(%edx), %ebx
1825 movl -39(%eax), %ecx
1826 movl -39(%edx), %ebx
1830 movl -35(%eax), %ecx
1831 movl -35(%edx), %ebx
1835 movl -31(%eax), %ecx
1836 movl -31(%edx), %ebx
1840 movl -27(%eax), %ecx
1841 movl -27(%edx), %ebx
1845 movl -23(%eax), %ecx
1846 movl -23(%edx), %ebx
1850 movl -19(%eax), %ecx
1851 movl -19(%edx), %ebx
1855 movl -15(%eax), %ecx
1856 movl -15(%edx), %ebx
1860 movl -11(%eax), %ecx
1861 movl -11(%edx), %ebx
1870 movzwl -3(%eax), %ecx
1871 movzwl -3(%edx), %ebx
/* Final differing byte, zero-extended into the return register.  */
1876 movzbl -1(%eax), %eax