/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <arm-features.h>
#include <sysdep.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK	1

	.syntax unified
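
/* Editorial note, not in the upstream source: in C terms the fast
   path enabled by STRCMP_PRECHECK is roughly

	if (s1[0] == '\0' || s1[0] != s2[0])
	  return s1[0] - s2[0];

   so completely random strings usually return after two byte loads,
   before any of the alignment machinery below is entered.  */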

#ifdef __ARM_BIG_ENDIAN
# define S2LO lsl
# define S2LOEQ lsleq
# define S2HI lsr
# define MSB 0x000000ff
# define LSB 0xff000000
# define BYTE0_OFFSET 24
# define BYTE1_OFFSET 16
# define BYTE2_OFFSET 8
# define BYTE3_OFFSET 0
#else /* not  __ARM_BIG_ENDIAN */
# define S2LO lsr
# define S2LOEQ lsreq
# define S2HI lsl
# define BYTE0_OFFSET 0
# define BYTE1_OFFSET 8
# define BYTE2_OFFSET 16
# define BYTE3_OFFSET 24
# define MSB 0xff000000
# define LSB 0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */
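
/* Editorial worked example, not in the upstream source: S2LO/S2HI
   shift a word towards the lower/higher string addresses regardless
   of endianness.  On little-endian, loading "ABCD" gives
   data = 0x44434241, so "data S2LO #8" (LSR) yields 0x00444342,
   dropping the first byte 'A', while data & MSB (0xff000000)
   extracts the last byte 'D'.  */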

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2
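
/* Editorial note, not in the upstream source: const_m1 keeps the
   all-ones word 0xffffffff live in r12 for the whole function; it is
   consumed by uadd8/sel on every iteration, so materializing it once
   with MVN at entry is cheaper than recreating the constant inside
   the loops.  */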

#ifndef NO_THUMB
/* This code is best on Thumb.  */
	.thumb

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
	.macro prepare_mask mask_reg, nbits_reg
	S2HI	\mask_reg, const_m1, \nbits_reg
	.endm
	.macro apply_mask data_reg, mask_reg
	orn	\data_reg, \data_reg, \mask_reg
	.endm
#else
/* In ARM code we don't have ORN, but we can use MVN with a register shift.  */
	.macro prepare_mask mask_reg, nbits_reg
	mvn	\mask_reg, const_m1, S2HI \nbits_reg
	.endm
	.macro apply_mask data_reg, mask_reg
	orr	\data_reg, \data_reg, \mask_reg
	.endm

	/* These clobber the condition codes, which the real Thumb cbz/cbnz
	   instructions do not.  But it doesn't matter for any of the uses
	   here.  */
	.macro cbz reg, label
	cmp	\reg, #0
	beq	\label
	.endm
	.macro cbnz reg, label
	cmp	\reg, #0
	bne	\label
	.endm
#endif
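
/* Editorial worked example, not in the upstream source: the two
   masking macros together compute data |= ~(const_m1 S2HI nbits).
   With nbits = 8 on little-endian, prepare_mask forms 0xffffff00 and
   apply_mask (ORN in Thumb, ORR of the MVN'd mask in ARM) sets the
   complement 0x000000ff in the data, forcing the single garbage byte
   below the true start of the string to 0xff in both inputs: it can
   then neither compare different nor be mistaken for a NUL.  */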

	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsl	\d1, \d1, tmp1
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	rsb	tmp1, tmp1, #32
	lsr	\d1, \d1, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	lsr	r1, \d2, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, r1
	bx	lr
#endif
	.endm
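
/* Editorial worked example, not in the upstream source: comparing
   "abc" with "abd" on big-endian gives data1 = 0x61626300 and
   data2 = 0x61626400, so EOR = 0x00000700, and the uadd8/sel pair
   (GE = 1,1,1,0, since only the NUL byte fails to carry out) yields
   syndrome = 0x000007ff.  clz = 21, and shifting both data words
   left by 21 puts the first differing bit at the top, so the final
   subtraction of the two high bytes has the sign of 'c' - 'd'.  */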

	.text
	.p2align	5
.Lstrcmp_start_addr:
#if STRCMP_PRECHECK == 1
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY (strcmp)
#if STRCMP_PRECHECK == 1
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	cfi_def_cfa_offset (16)
	cfi_offset (r4, -16)
	cfi_offset (r5, -12)
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	cfi_offset (r6, -8)
	cfi_offset (r7, -4)
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	prepare_mask tmp1, tmp2
	apply_mask data1a, tmp1
	apply_mask data2a, tmp1
	beq	.Lstart_realigned8
	apply_mask data1b, tmp1
	mov	data1a, const_m1
	apply_mask data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
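
/* Editorial worked example, not in the upstream source: for a data1
   word holding bytes 0x61, 0x02, 0x00, 0x7f, the uadd8 against
   const_m1 adds 0xff to each byte; every non-zero byte carries out
   and sets its GE flag, the NUL byte does not (GE = 1,1,0,1).  sel
   then builds the syndrome from the EOR bytes where GE=1 and from
   0xff where GE=0, so the syndrome is non-zero exactly when the
   words differ or data1 contains the terminator.  */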

.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8
	prepare_mask tmp1, tmp1
	apply_mask data1, tmp1
	apply_mask data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#if STRCMP_PRECHECK == 0
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	cfi_restore (r4)
	cfi_def_cfa_offset (0)
	bx	lr
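
/* Editorial note, not in the upstream source: each block above
   checks a single byte: uxtb with "ror #BYTEn_OFFSET" rotates byte n
   of data1 down to bits 7:0 and zero-extends it, so the SUBS both
   computes the byte difference that .Lmisaligned_exit returns and
   sets the flags for the mismatch and NUL tests.  */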

#if STRCMP_PRECHECK == 1
.Laligned_m1:
	add	src2, src2, #4
#endif

.Lsrc1_aligned:
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3

4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 Not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	neg	result, result
	bx	lr

6:
	cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail
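
/* Editorial note, not in the upstream source: the three .LoverlapN
   blocks handle a word-aligned src1 against a src2 that is N bytes
   short of word alignment.  Each iteration loads one word per
   stream, compares the N overlapping bytes via the S2LO-shifted EOR,
   and checks the remaining bytes of data1 against the next data2
   word shifted the other way (S2HI), so the steady-state cost stays
   at one load and one compare sequence per word.  */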

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2

4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1

4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	ldrd	r4, r5, [sp], #16
	cfi_remember_state
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	bx	lr

.Lstrcmp_tail:
	cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	cfi_def_cfa_offset (0)
	cfi_restore (r4)
	cfi_restore (r5)
	/* R6/7 not used in this sequence.  */
	cfi_restore (r6)
	cfi_restore (r7)
	sub	result, result, data2, lsr #24
	bx	lr
END (strcmp)
libc_hidden_builtin_def (strcmp)
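
/* Editorial worked example, not in the upstream source: .Lstrcmp_tail
   receives a pair of words whose first difference or NUL is already
   known to lie within them.  On little-endian the two REVs make the
   words look big-endian, so a single clz on the syndrome locates the
   first differing bit; both words are then shifted so that bit sits
   at the top, and the two high bytes are subtracted, giving a
   correctly signed result without computing the exact byte index.  */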