1 /* memcmp with SSE4.2, wmemcmp with SSE4.2
2 Copyright (C) 2010-2012 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
/* MEMCMP expands to the SSE4.2-specific symbol name.  */
25 # define MEMCMP __memcmp_sse4_2
/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the current CFA
   register.  */
28 # define CFI_PUSH(REG) \
29 cfi_adjust_cfa_offset (4); \
30 cfi_rel_offset (REG, 0)
/* Unwind bookkeeping for a 4-byte pop: the CFA offset shrinks by 4.  */
32 # define CFI_POP(REG) \
33 cfi_adjust_cfa_offset (-4); \
36 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
/* PUSH (above) and POP (below) pair each pushl/popl with the matching
   CFI update.  */
37 # define POP(REG) popl REG; CFI_POP (REG)
/* BLK2 is 4 bytes past BLK1 -- presumably the stack slots of the two
   buffer-pointer arguments; confirm against BLK1's definition.  */
41 # define BLK2 BLK1 + 4
/* Restore EBX and return.  The CFI_PUSH after the ret emits only CFI
   directives (no instruction): it re-records EBX as pushed, so code
   assembled after a RETURN is again described with EBX on the stack.  */
43 # define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
/* Jump-table entry for label I, stored as an offset relative to the
   table base B.  */
47 # define JMPTBL(I, B) I - B
49 /* Load an entry in a jump table into EBX and branch to it. TABLE is a
50 jump table with relative offsets. INDEX is a register that contains the
51 index into the jump table. SCALE is the scale of INDEX. */
53 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
54 /* We first load PC into EBX. */ \
56 /* Get the address of the jump table. */ \
57 addl $(TABLE - .), %ebx; \
58 /* Get the entry and convert the relative offset to the \
59 absolute address. */ \
60 addl (%ebx,INDEX,SCALE), %ebx; \
61 /* We loaded the jump table and adjusted EDX/ESI. Go. */ \
64 # define JMPTBL(I, B) I
66 /* Branch to an entry in a jump table. TABLE is a jump table with
67 absolute addresses (JMPTBL yields I unchanged here). INDEX is a register
68 that contains the index into the jump table. SCALE is the scale of INDEX. */
69 # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
70 jmp *TABLE(,INDEX,SCALE)
75 wmemcmp has to use SIGNED comparison for elements.
76 memcmp has to use UNSIGNED comparison for elements.
79 .section .text.sse4.2,"ax",@progbits
85 # ifdef USE_AS_WMEMCMP
99 # ifndef USE_AS_WMEMCMP
109 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
111 # ifndef USE_AS_WMEMCMP
173 # ifdef USE_AS_WMEMCMP
175 /* for wmemcmp, case N == 1 */
183 jg L(find_diff_bigger)
197 # ifndef USE_AS_WMEMCMP
217 L(64bytesormore_loop):
224 movdqu 16(%eax), %xmm1
225 movdqu 16(%edx), %xmm2
230 movdqu 32(%eax), %xmm1
231 movdqu 32(%edx), %xmm2
236 movdqu 48(%eax), %xmm1
237 movdqu 48(%edx), %xmm2
244 jae L(64bytesormore_loop)
248 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
250 # ifdef USE_AS_WMEMCMP
252 /* This label is needed only to fill table_64bytes. */
268 # ifndef USE_AS_WMEMCMP
314 # ifndef USE_AS_WMEMCMP
317 movdqu -49(%eax), %xmm1
318 movdqu -49(%edx), %xmm2
324 movdqu -33(%eax), %xmm1
325 movdqu -33(%edx), %xmm2
350 movzbl -1(%eax), %ecx
359 movdqu -50(%eax), %xmm1
360 movdqu -50(%edx), %xmm2
366 movdqu -34(%eax), %xmm1
367 movdqu -34(%edx), %xmm2
392 movzwl -2(%eax), %ecx
393 movzwl -2(%edx), %ebx
404 movdqu -51(%eax), %xmm1
405 movdqu -51(%edx), %xmm2
411 movdqu -35(%eax), %xmm1
412 movdqu -35(%edx), %xmm2
437 movzwl -3(%eax), %ecx
438 movzwl -3(%edx), %ebx
444 movzbl -1(%eax), %eax
452 movdqu -52(%eax), %xmm1
453 movdqu -52(%edx), %xmm2
459 movdqu -36(%eax), %xmm1
460 movdqu -36(%edx), %xmm2
466 movdqu -20(%eax), %xmm1
467 movdqu -20(%edx), %xmm2
473 # ifndef USE_AS_WMEMCMP
483 # ifndef USE_AS_WMEMCMP
486 movdqu -53(%eax), %xmm1
487 movdqu -53(%edx), %xmm2
494 movdqu -37(%eax), %xmm1
495 movdqu -37(%edx), %xmm2
501 movdqu -21(%eax), %xmm1
502 movdqu -21(%edx), %xmm2
510 movzbl -1(%eax), %ecx
518 movdqu -54(%eax), %xmm1
519 movdqu -54(%edx), %xmm2
526 movdqu -38(%eax), %xmm1
527 movdqu -38(%edx), %xmm2
533 movdqu -22(%eax), %xmm1
534 movdqu -22(%edx), %xmm2
543 movzwl -2(%eax), %ecx
544 movzwl -2(%edx), %ebx
554 movdqu -55(%eax), %xmm1
555 movdqu -55(%edx), %xmm2
562 movdqu -39(%eax), %xmm1
563 movdqu -39(%edx), %xmm2
569 movdqu -23(%eax), %xmm1
570 movdqu -23(%edx), %xmm2
578 movzwl -3(%eax), %ecx
579 movzwl -3(%edx), %ebx
584 movzbl -1(%eax), %eax
592 movdqu -56(%eax), %xmm1
593 movdqu -56(%edx), %xmm2
600 movdqu -40(%eax), %xmm1
601 movdqu -40(%edx), %xmm2
607 movdqu -24(%eax), %xmm1
608 movdqu -24(%edx), %xmm2
614 # ifndef USE_AS_WMEMCMP
623 # ifndef USE_AS_WMEMCMP
633 # ifndef USE_AS_WMEMCMP
636 movdqu -57(%eax), %xmm1
637 movdqu -57(%edx), %xmm2
644 movdqu -41(%eax), %xmm1
645 movdqu -41(%edx), %xmm2
651 movdqu -25(%eax), %xmm1
652 movdqu -25(%edx), %xmm2
664 movzbl -1(%eax), %ecx
672 movdqu -58(%eax), %xmm1
673 movdqu -58(%edx), %xmm2
680 movdqu -42(%eax), %xmm1
681 movdqu -42(%edx), %xmm2
687 movdqu -26(%eax), %xmm1
688 movdqu -26(%edx), %xmm2
703 movzwl -2(%eax), %ecx
704 movzwl -2(%edx), %ebx
714 movdqu -59(%eax), %xmm1
715 movdqu -59(%edx), %xmm2
722 movdqu -43(%eax), %xmm1
723 movdqu -43(%edx), %xmm2
729 movdqu -27(%eax), %xmm1
730 movdqu -27(%edx), %xmm2
742 movzwl -3(%eax), %ecx
743 movzwl -3(%edx), %ebx
748 movzbl -1(%eax), %eax
756 movdqu -60(%eax), %xmm1
757 movdqu -60(%edx), %xmm2
764 movdqu -44(%eax), %xmm1
765 movdqu -44(%edx), %xmm2
771 movdqu -28(%eax), %xmm1
772 movdqu -28(%edx), %xmm2
778 # ifndef USE_AS_WMEMCMP
787 # ifndef USE_AS_WMEMCMP
796 # ifndef USE_AS_WMEMCMP
806 # ifndef USE_AS_WMEMCMP
809 movdqu -61(%eax), %xmm1
810 movdqu -61(%edx), %xmm2
817 movdqu -45(%eax), %xmm1
818 movdqu -45(%edx), %xmm2
824 movdqu -29(%eax), %xmm1
825 movdqu -29(%edx), %xmm2
844 movzbl -1(%eax), %ecx
852 movdqu -62(%eax), %xmm1
853 movdqu -62(%edx), %xmm2
860 movdqu -46(%eax), %xmm1
861 movdqu -46(%edx), %xmm2
867 movdqu -30(%eax), %xmm1
868 movdqu -30(%edx), %xmm2
884 movzwl -2(%eax), %ecx
885 movzwl -2(%edx), %ebx
895 movdqu -63(%eax), %xmm1
896 movdqu -63(%edx), %xmm2
903 movdqu -47(%eax), %xmm1
904 movdqu -47(%edx), %xmm2
910 movdqu -31(%eax), %xmm1
911 movdqu -31(%edx), %xmm2
928 movzwl -3(%eax), %ecx
929 movzwl -3(%edx), %ebx
934 movzbl -1(%eax), %eax
943 movdqu -64(%eax), %xmm1
944 movdqu -64(%edx), %xmm2
950 movdqu -48(%eax), %xmm1
951 movdqu -48(%edx), %xmm2
957 movdqu -32(%eax), %xmm1
958 movdqu -32(%edx), %xmm2
965 # ifndef USE_AS_WMEMCMP
974 # ifndef USE_AS_WMEMCMP
983 # ifndef USE_AS_WMEMCMP
992 # ifndef USE_AS_WMEMCMP
1002 # ifndef USE_AS_WMEMCMP
1057 # ifndef USE_AS_WMEMCMP
/* L(table_64bytes): jump table consumed by
   BRANCH_TO_JMPTBL_ENTRY (L(table_64bytes), %ecx, 4).  Entry N is the
   JMPTBL-encoded target for index N, N = 0..64; the index register is
   presumably the number of bytes left to compare -- confirm at the
   dispatch sites.  */
1087 .section .rodata.sse4.2,"a",@progbits
1089 .type L(table_64bytes), @object
1090 # ifndef USE_AS_WMEMCMP
/* Without USE_AS_WMEMCMP: every index 0..64 has its own L(Nbytes)
   target.  */
1092 .int JMPTBL (L(0bytes), L(table_64bytes))
1093 .int JMPTBL (L(1bytes), L(table_64bytes))
1094 .int JMPTBL (L(2bytes), L(table_64bytes))
1095 .int JMPTBL (L(3bytes), L(table_64bytes))
1096 .int JMPTBL (L(4bytes), L(table_64bytes))
1097 .int JMPTBL (L(5bytes), L(table_64bytes))
1098 .int JMPTBL (L(6bytes), L(table_64bytes))
1099 .int JMPTBL (L(7bytes), L(table_64bytes))
1100 .int JMPTBL (L(8bytes), L(table_64bytes))
1101 .int JMPTBL (L(9bytes), L(table_64bytes))
1102 .int JMPTBL (L(10bytes), L(table_64bytes))
1103 .int JMPTBL (L(11bytes), L(table_64bytes))
1104 .int JMPTBL (L(12bytes), L(table_64bytes))
1105 .int JMPTBL (L(13bytes), L(table_64bytes))
1106 .int JMPTBL (L(14bytes), L(table_64bytes))
1107 .int JMPTBL (L(15bytes), L(table_64bytes))
1108 .int JMPTBL (L(16bytes), L(table_64bytes))
1109 .int JMPTBL (L(17bytes), L(table_64bytes))
1110 .int JMPTBL (L(18bytes), L(table_64bytes))
1111 .int JMPTBL (L(19bytes), L(table_64bytes))
1112 .int JMPTBL (L(20bytes), L(table_64bytes))
1113 .int JMPTBL (L(21bytes), L(table_64bytes))
1114 .int JMPTBL (L(22bytes), L(table_64bytes))
1115 .int JMPTBL (L(23bytes), L(table_64bytes))
1116 .int JMPTBL (L(24bytes), L(table_64bytes))
1117 .int JMPTBL (L(25bytes), L(table_64bytes))
1118 .int JMPTBL (L(26bytes), L(table_64bytes))
1119 .int JMPTBL (L(27bytes), L(table_64bytes))
1120 .int JMPTBL (L(28bytes), L(table_64bytes))
1121 .int JMPTBL (L(29bytes), L(table_64bytes))
1122 .int JMPTBL (L(30bytes), L(table_64bytes))
1123 .int JMPTBL (L(31bytes), L(table_64bytes))
1124 .int JMPTBL (L(32bytes), L(table_64bytes))
1125 .int JMPTBL (L(33bytes), L(table_64bytes))
1126 .int JMPTBL (L(34bytes), L(table_64bytes))
1127 .int JMPTBL (L(35bytes), L(table_64bytes))
1128 .int JMPTBL (L(36bytes), L(table_64bytes))
1129 .int JMPTBL (L(37bytes), L(table_64bytes))
1130 .int JMPTBL (L(38bytes), L(table_64bytes))
1131 .int JMPTBL (L(39bytes), L(table_64bytes))
1132 .int JMPTBL (L(40bytes), L(table_64bytes))
1133 .int JMPTBL (L(41bytes), L(table_64bytes))
1134 .int JMPTBL (L(42bytes), L(table_64bytes))
1135 .int JMPTBL (L(43bytes), L(table_64bytes))
1136 .int JMPTBL (L(44bytes), L(table_64bytes))
1137 .int JMPTBL (L(45bytes), L(table_64bytes))
1138 .int JMPTBL (L(46bytes), L(table_64bytes))
1139 .int JMPTBL (L(47bytes), L(table_64bytes))
1140 .int JMPTBL (L(48bytes), L(table_64bytes))
1141 .int JMPTBL (L(49bytes), L(table_64bytes))
1142 .int JMPTBL (L(50bytes), L(table_64bytes))
1143 .int JMPTBL (L(51bytes), L(table_64bytes))
1144 .int JMPTBL (L(52bytes), L(table_64bytes))
1145 .int JMPTBL (L(53bytes), L(table_64bytes))
1146 .int JMPTBL (L(54bytes), L(table_64bytes))
1147 .int JMPTBL (L(55bytes), L(table_64bytes))
1148 .int JMPTBL (L(56bytes), L(table_64bytes))
1149 .int JMPTBL (L(57bytes), L(table_64bytes))
1150 .int JMPTBL (L(58bytes), L(table_64bytes))
1151 .int JMPTBL (L(59bytes), L(table_64bytes))
1152 .int JMPTBL (L(60bytes), L(table_64bytes))
1153 .int JMPTBL (L(61bytes), L(table_64bytes))
1154 .int JMPTBL (L(62bytes), L(table_64bytes))
1155 .int JMPTBL (L(63bytes), L(table_64bytes))
1156 .int JMPTBL (L(64bytes), L(table_64bytes))
/* Second set of 65 entries: only indexes that are a multiple of 4 have an
   L(Nbytes) target; all other indexes go to L(unreal_case).
   NOTE(review): presumably the USE_AS_WMEMCMP variant, where lengths are
   whole 4-byte elements -- confirm against the conditional that selects
   it.  */
1159 .int JMPTBL (L(0bytes), L(table_64bytes))
1160 .int JMPTBL (L(unreal_case), L(table_64bytes))
1161 .int JMPTBL (L(unreal_case), L(table_64bytes))
1162 .int JMPTBL (L(unreal_case), L(table_64bytes))
1163 .int JMPTBL (L(4bytes), L(table_64bytes))
1164 .int JMPTBL (L(unreal_case), L(table_64bytes))
1165 .int JMPTBL (L(unreal_case), L(table_64bytes))
1166 .int JMPTBL (L(unreal_case), L(table_64bytes))
1167 .int JMPTBL (L(8bytes), L(table_64bytes))
1168 .int JMPTBL (L(unreal_case), L(table_64bytes))
1169 .int JMPTBL (L(unreal_case), L(table_64bytes))
1170 .int JMPTBL (L(unreal_case), L(table_64bytes))
1171 .int JMPTBL (L(12bytes), L(table_64bytes))
1172 .int JMPTBL (L(unreal_case), L(table_64bytes))
1173 .int JMPTBL (L(unreal_case), L(table_64bytes))
1174 .int JMPTBL (L(unreal_case), L(table_64bytes))
1175 .int JMPTBL (L(16bytes), L(table_64bytes))
1176 .int JMPTBL (L(unreal_case), L(table_64bytes))
1177 .int JMPTBL (L(unreal_case), L(table_64bytes))
1178 .int JMPTBL (L(unreal_case), L(table_64bytes))
1179 .int JMPTBL (L(20bytes), L(table_64bytes))
1180 .int JMPTBL (L(unreal_case), L(table_64bytes))
1181 .int JMPTBL (L(unreal_case), L(table_64bytes))
1182 .int JMPTBL (L(unreal_case), L(table_64bytes))
1183 .int JMPTBL (L(24bytes), L(table_64bytes))
1184 .int JMPTBL (L(unreal_case), L(table_64bytes))
1185 .int JMPTBL (L(unreal_case), L(table_64bytes))
1186 .int JMPTBL (L(unreal_case), L(table_64bytes))
1187 .int JMPTBL (L(28bytes), L(table_64bytes))
1188 .int JMPTBL (L(unreal_case), L(table_64bytes))
1189 .int JMPTBL (L(unreal_case), L(table_64bytes))
1190 .int JMPTBL (L(unreal_case), L(table_64bytes))
1191 .int JMPTBL (L(32bytes), L(table_64bytes))
1192 .int JMPTBL (L(unreal_case), L(table_64bytes))
1193 .int JMPTBL (L(unreal_case), L(table_64bytes))
1194 .int JMPTBL (L(unreal_case), L(table_64bytes))
1195 .int JMPTBL (L(36bytes), L(table_64bytes))
1196 .int JMPTBL (L(unreal_case), L(table_64bytes))
1197 .int JMPTBL (L(unreal_case), L(table_64bytes))
1198 .int JMPTBL (L(unreal_case), L(table_64bytes))
1199 .int JMPTBL (L(40bytes), L(table_64bytes))
1200 .int JMPTBL (L(unreal_case), L(table_64bytes))
1201 .int JMPTBL (L(unreal_case), L(table_64bytes))
1202 .int JMPTBL (L(unreal_case), L(table_64bytes))
1203 .int JMPTBL (L(44bytes), L(table_64bytes))
1204 .int JMPTBL (L(unreal_case), L(table_64bytes))
1205 .int JMPTBL (L(unreal_case), L(table_64bytes))
1206 .int JMPTBL (L(unreal_case), L(table_64bytes))
1207 .int JMPTBL (L(48bytes), L(table_64bytes))
1208 .int JMPTBL (L(unreal_case), L(table_64bytes))
1209 .int JMPTBL (L(unreal_case), L(table_64bytes))
1210 .int JMPTBL (L(unreal_case), L(table_64bytes))
1211 .int JMPTBL (L(52bytes), L(table_64bytes))
1212 .int JMPTBL (L(unreal_case), L(table_64bytes))
1213 .int JMPTBL (L(unreal_case), L(table_64bytes))
1214 .int JMPTBL (L(unreal_case), L(table_64bytes))
1215 .int JMPTBL (L(56bytes), L(table_64bytes))
1216 .int JMPTBL (L(unreal_case), L(table_64bytes))
1217 .int JMPTBL (L(unreal_case), L(table_64bytes))
1218 .int JMPTBL (L(unreal_case), L(table_64bytes))
1219 .int JMPTBL (L(60bytes), L(table_64bytes))
1220 .int JMPTBL (L(unreal_case), L(table_64bytes))
1221 .int JMPTBL (L(unreal_case), L(table_64bytes))
1222 .int JMPTBL (L(unreal_case), L(table_64bytes))
1223 .int JMPTBL (L(64bytes), L(table_64bytes))