/* strcmp implementation for ARMv7-A, optimized for Cortex-A15.
   Copyright (C) 2012-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */

#include <arm-features.h>

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data is
   sufficiently aligned.  Use saturating arithmetic to optimize the
   compares.  */

/* STRCMP_PRECHECK: Run a quick pre-check of the first byte in the
   string.  If comparing completely random strings the pre-check will
   save time, since there is a very high probability of a mismatch in
   the first character: we save significant overhead if this is the
   common case.  However, if strings are likely to be identical (e.g.
   because we're verifying a hit in a hash table), then this check
   is largely redundant.  */

#define STRCMP_PRECHECK	1
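
/* With the pre-check enabled, the entry code compares the leading byte of
   each string before saving registers and setting up the aligned loops.
   Roughly, as a C-level sketch rather than the exact register sequence:

	if (s1[0] == '\0' || s1[0] != s2[0])
	  return (unsigned char) s1[0] - (unsigned char) s2[0];

   Only when the first bytes match (and are not NUL) is the full comparison
   machinery entered.  */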

#ifdef __ARM_BIG_ENDIAN
# define MSB		0x000000ff
# define LSB		0xff000000
# define BYTE0_OFFSET	24
# define BYTE1_OFFSET	16
# define BYTE2_OFFSET	8
# define BYTE3_OFFSET	0
#else /* not __ARM_BIG_ENDIAN */
# define BYTE0_OFFSET	0
# define BYTE1_OFFSET	8
# define BYTE2_OFFSET	16
# define BYTE3_OFFSET	24
# define MSB		0xff000000
# define LSB		0x000000ff
#endif /* not __ARM_BIG_ENDIAN */
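
/* Note that MSB/LSB and the BYTEn_OFFSET values are defined in terms of
   string (memory) order rather than register significance: LSB always
   selects the byte loaded from the lowest address (string byte 0) and MSB
   the byte from the highest address, while BYTEn_OFFSET is the bit
   position of string byte n within a loaded word, for either endianness.  */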

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2

/* This code is best on Thumb.  */

/* In Thumb code we can't use MVN with a register shift, but we do have ORN.  */
.macro prepare_mask mask_reg, nbits_reg
	S2HI	\mask_reg, const_m1, \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
	orn	\data_reg, \data_reg, \mask_reg
.endm
#else
/* In ARM code we don't have ORN, but we can use MVN with a register shift.  */
.macro prepare_mask mask_reg, nbits_reg
	mvn	\mask_reg, const_m1, S2HI \nbits_reg
.endm
.macro apply_mask data_reg, mask_reg
	orr	\data_reg, \data_reg, \mask_reg
.endm
#endif
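
/* Whichever variant is used, the net effect of the two macros is the same.
   A rough C-level sketch of the little-endian case (the ARM variant simply
   computes the complemented mask directly):

	mask = 0xffffffff << nbits;	prepare_mask
	data |= ~mask;			apply_mask

   i.e. the nbits low-order bits of the loaded word -- the bytes that sit
   before the real start of the string after aligning the pointer downwards
   -- are forced to ones in both inputs, so they can neither produce a
   spurious difference nor be mistaken for a NUL terminator.  */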

/* These clobber the condition codes, which the real Thumb cbz/cbnz
   instructions do not.  But it doesn't matter for any of the uses here.  */
	.macro	cbz	reg, label
	cmp	\reg, #0
	beq	\label
	.endm
	.macro	cbnz	reg, label
	cmp	\reg, #0
	bne	\label
	.endm

/* Macro to compute and return the result value for word-aligned
   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */

	ldrd	r6, r7, [sp, #8]

	ldrd	r4, r5, [sp], #16

	cfi_def_cfa_offset (0)

	sub	result, result, r1, lsr #24
#else
	/* To use the big-endian trick we'd have to reverse all three words,
	   and that's slower than this approach.  */

	ldrd	r6, r7, [sp, #8]

	and	result, \d1, #255

	ldrd	r4, r5, [sp], #16

	cfi_def_cfa_offset (0)

	sub	result, result, r1
#endif
	.endm
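
/* In both branches of the macro the syndrome is used to locate the first
   byte position at which the two words differ (or at which data1 holds a
   NUL), and the return value is simply the difference of that pair of
   bytes, which gives the negative/zero/positive result strcmp requires.  */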

#if STRCMP_PRECHECK == 1

#if STRCMP_PRECHECK == 1

	strd	r4, r5, [sp, #-16]!
	cfi_def_cfa_offset (16)

	strd	r6, r7, [sp, #8]

	cbz	r2, .Lloop_aligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */

	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16

	ldrd	data2a, data2b, [src2], #16
	prepare_mask tmp1, tmp2
	apply_mask data1a, tmp1
	apply_mask data2a, tmp1
	beq	.Lstart_realigned8
	apply_mask data1b, tmp1

	apply_mask data2b, tmp1
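
	/* Both pointers have now been aligned down to an 8-byte boundary, and
	   any bytes loaded from before the true start of the strings have
	   been forced to 0xff in both inputs, so they can neither produce a
	   spurious mismatch nor look like a NUL terminator.  */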

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
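	/* The comparisons are built on a per-word "syndrome".  As a rough
	   sketch of one step (illustrative, not the exact code below):

		uadd8	syndrome, data1, const_m1	sets GE[i] iff byte i of data1 != 0
		eor	syndrome, data1, data2		per-byte difference
		sel	syndrome, syndrome, const_m1	keep the difference, or 0xff at a NUL

	   Each syndrome byte ends up as 0xff where data1 holds a NUL and as
	   (data1 ^ data2) elsewhere, so the syndrome is zero only when the
	   words are equal and contain no terminator -- the only case in which
	   the loop may continue.  */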
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16

	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only need if s_a == 0  */

	cbnz	syndrome_a, .Ldiff_in_a

	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */

	ldr	data1, [src1], #8
	ldr	data2, [src2], #8

	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1

	strcmp_epilogue_aligned syndrome, data1, data2, 0
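
	/* The 32-bit aligned loop above mirrors the 64-bit one: two words
	   (eight bytes) are compared per iteration, with the same
	   uadd8/eor/sel syndrome test deciding whether a difference or a NUL
	   byte has been seen.  */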

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */

	ldr	data1, [src1], #8

	ldr	data2, [src2], #8

	prepare_mask tmp1, tmp1
	apply_mask data1, tmp1
	apply_mask data2, tmp1
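
	/* This is the 4-byte analogue of the 8-byte realignment above: the
	   sources share word alignment here, so only a single word from each
	   needs its leading bytes masked before joining the word-aligned
	   loop.  */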

	ldr	data1, [src1], #4
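
	/* Each group of checks below compares one byte of the word just
	   loaded from src1 (extracted with uxtb ... ror #BYTEn_OFFSET)
	   against the next byte of src2, taking the exit path as soon as the
	   bytes differ or the string ends; after that, src1 can be treated
	   as word aligned.  */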

#if STRCMP_PRECHECK == 0
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit

	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1

	cfi_def_cfa_offset (0)

#if STRCMP_PRECHECK == 1

	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
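	/* The single LSLS above classifies all three cases: shifting src2
	   left by 31 moves bit 1 of the address into the carry flag and
	   leaves Z set only when bit 0 is clear, so the two branches and the
	   fall-through handle src2 & 3 == 3, 2 and 1 respectively.  In each
	   overlap case a word of src1 is compared against the remaining
	   bytes of the previously loaded src2 word combined with the freshly
	   loaded one, shifted so that the two byte streams line up again.  */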
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24

	ldr	data1, [src1], #4

	S2LO	data2, data2, #8

	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */

	ldrd	r4, r5, [sp], #16

	cfi_def_cfa_offset (0)

	/* R6/7 not used in this sequence.  */

	S2LO	data1, data1, #24
	and	data2, data2, #LSB

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */

	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16

	ldr	data1, [src1], #4

	S2LO	data2, data2, #16

	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif

	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */

	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1

	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8

	ldr	data1, [src1], #4

	S2LO	data2, data2, #24

	bne	.Lstrcmp_done_equal

	S2LO	data1, data1, #8
	bic	data2, data2, #MSB

	ldrd	r4, r5, [sp], #16

	cfi_def_cfa_offset (0)

	/* R6/7 not used in this sequence.  */

#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
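
	/* From here on, the first differing or NUL byte can be located by
	   counting the leading zeros of the syndrome: shifting both data
	   words left by that amount brings the decisive bytes to the top,
	   and subtracting the two top bytes yields the usual negative, zero
	   or positive strcmp result.  */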
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	cfi_def_cfa_offset (0)

	/* R6/7 not used in this sequence.  */

	sub	result, result, data2, lsr #24

libc_hidden_builtin_def (strcmp)