1 /* Optimized memcmp implementation for PowerPC32.
2 Copyright (C) 2003, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */
24 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
27 EALIGN (BP_SYM(memcmp), 4, 0)
/* memcmp: compare rN bytes at rSTR1 and rSTR2; return <0/0/>0 in r3.
   Hand-scheduled for POWER4: compares are streamed through four CR
   fields (cr0/cr1/cr6/cr5) so the 16-byte-per-iteration word loops can
   issue loads, compares and branches in separate dispatch groups.  */
32 #define rSTR1 r3 /* first string arg */
33 #define rSTR2 r4 /* second string arg */
34 #define rN r5 /* max string length */
35 #define rWORD1 r6 /* current word in s1 */
36 #define rWORD2 r7 /* current word in s2 */
37 #define rWORD3 r8 /* next word in s1 */
38 #define rWORD4 r9 /* next word in s2 */
39 #define rWORD5 r10 /* next word in s1 */
40 #define rWORD6 r11 /* next word in s2 */
41 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
42 #define rWORD7 r30 /* next word in s1 */
43 #define rWORD8 r31 /* next word in s2 */
/* rWORD7/rWORD8 are non-volatile (r30/r31); they are only written on
   paths that have already saved them in the stack frame.  */
/* XOR of the pointers: low bits are zero iff s1 and s2 share the same
   alignment within a word.  */
45 xor rTMP, rSTR2, rSTR1
/* Keep only the low 2 bits; the '.' form sets cr0 for a (not visible
   here) branch that selects the different-alignment path.  */
48 clrlwi. rTMP, rTMP, 30
/* rBITDIF = byte offset of rSTR1 within its word (0..3).  */
49 clrlwi rBITDIF, rSTR1, 30
50 cmplwi cr5, rBITDIF, 0
/* NOTE(review): cr6 is presumably set from a compare of rN against 0
   by an instruction not visible here -- confirm before relying on it.  */
51 beq- cr6, L(zeroLength)
54 /* If less than 8 bytes or not aligned, use the unaligned byte loop.  */
/* cr1 presumably holds rN compared against 8 -- TODO confirm.  */
56 blt cr1, L(bytealigned)
/* A 64-byte frame is allocated here (store-with-update not visible);
   describe it and the r31/r30 save slots to the unwinder.  */
58 cfi_adjust_cfa_offset(64)
60 cfi_offset(31,(48-64))
62 cfi_offset(30,(44-64))
64 /* At this point we know both strings have the same alignment and the
65 compare length is at least 8 bytes. rBITDIF contains the low order
66 2 bits of rSTR1 and cr5 contains the result of the logical compare
67 of rBITDIF to 0. If rBITDIF == 0 then we are already word
68 aligned and can perform the word aligned loop.
70 Otherwise we know the two strings have the same alignment (but not
71 yet word aligned). So we force the string addresses to the next lower
72 word boundary and special case this first word using shift left to
73 eliminate bits preceding the first byte. Since we want to join the
74 normal (word aligned) compare loop, starting at the second word,
75 we need to adjust the length (rN) and special case the loop
76 versioning for the first word. This ensures that the loop count is
77 correct and the first word (shifted) is in the expected register pair. */
/* Round both pointers down to a word boundary.  */
80 clrrwi rSTR1, rSTR1, 2
81 clrrwi rSTR2, rSTR2, 2
85 srwi rTMP, rN, 4 /* Divide by 16 */
86 andi. rBITDIF, rN, 12 /* Get the word remainder */
89 cmplwi cr1, rBITDIF, 8
93 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
/* r11 presumably holds the alignment offset scaled to bits (8 * the
   low 2 bits of the original pointer) -- the computation is not
   visible here.  Shifting left discards bytes before the true start.  */
100 slw rWORD5, rWORD1, r11
101 slw rWORD6, rWORD2, r11
102 cmplw cr5, rWORD5, rWORD6
104 /* Do something useful in this cycle since we have to branch anyway. */
107 cmplw cr0, rWORD1, rWORD2
/* Word-remainder 8 entry: first (shifted) word pair in rWORD5/rWORD6.  */
112 slw rWORD5, rWORD1, r11
113 slw rWORD6, rWORD2, r11
114 cmplw cr6, rWORD5, rWORD6
116 /* Do something useful in this cycle since we have to branch anyway. */
119 cmplw cr5, rWORD7, rWORD8
121 /* Remainder is 12 */
124 slw rWORD3, rWORD1, r11
125 slw rWORD4, rWORD2, r11
126 cmplw cr1, rWORD3, rWORD4
128 /* Count is a multiple of 16, remainder is 0 */
131 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
132 slw rWORD1, rWORD1, r11
133 slw rWORD2, rWORD2, r11
134 cmplw cr0, rWORD1, rWORD2
137 /* At this point we know both strings are word aligned and the
138 compare length is at least 8 bytes. */
141 andi. rBITDIF, rN, 12 /* Get the word remainder */
142 srwi rTMP, rN, 4 /* Divide by 16 */
143 cmplwi cr1, rBITDIF, 8
153 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
154 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
155 (8-15 byte compare), we want to use only volatile registers. This
156 means we can avoid restoring non-volatile registers since we did not
157 change any on the early exit path. The key here is the non-early
158 exit path only cares about the condition code (cr5), not about which
159 register pair was used. */
162 cmplw cr5, rWORD5, rWORD6
166 cmplw cr0, rWORD1, rWORD2
170 cmplw cr1, rWORD3, rWORD4
171 lwz rWORD5, 12(rSTR1)
172 lwz rWORD6, 12(rSTR2)
173 cmplw cr6, rWORD5, rWORD6
/* Update form advances both pointers by 16 for the next iteration.  */
177 lwzu rWORD7, 16(rSTR1)
178 lwzu rWORD8, 16(rSTR2)
180 cmplw cr5, rWORD7, rWORD8
/* Tail: r12 presumably holds the remaining length scaled to bits
   (rN * 8) -- the scaling instruction is not visible here.  */
189 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
198 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
201 cmplw cr6, rWORD5, rWORD6
205 cmplw cr5, rWORD7, rWORD8
209 cmplw cr0, rWORD1, rWORD2
210 lwz rWORD3, 12(rSTR1)
211 lwz rWORD4, 12(rSTR2)
212 cmplw cr1, rWORD3, rWORD4
218 /* Again we are on an early exit path (16-23 byte compare), we want to
219 only use volatile registers and avoid restoring non-volatile ones.  */
225 cmplw cr5, rWORD3, rWORD4
231 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
237 /* Remainder is 12 */
240 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
243 cmplw cr1, rWORD3, rWORD4
247 cmplw cr6, rWORD5, rWORD6
251 cmplw cr5, rWORD7, rWORD8
252 lwz rWORD1, 12(rSTR1)
253 lwz rWORD2, 12(rSTR2)
254 cmplw cr0, rWORD1, rWORD2
260 /* Again we are on an early exit path (24-31 byte compare), we want to
261 only use volatile registers and avoid restoring non-volatile ones.  */
267 cmplw cr5, rWORD1, rWORD2
273 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
280 /* Count is a multiple of 16, remainder is 0 */
283 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
286 cmplw cr0, rWORD1, rWORD2
290 cmplw cr1, rWORD3, rWORD4
293 cmplw cr6, rWORD5, rWORD6
294 lwzu rWORD7, 12(rSTR1)
295 lwzu rWORD8, 12(rSTR2)
296 cmplw cr5, rWORD7, rWORD8
299 bdz- L(d24) /* Adjust CTR as we start with +4 */
300 /* This is the primary loop */
/* Software-pipelined: each iteration compares the 4 words loaded by
   the previous one while prefetching the next 16 bytes.  */
305 cmplw cr1, rWORD3, rWORD4
310 cmplw cr6, rWORD5, rWORD6
313 lwz rWORD5, 12(rSTR1)
314 lwz rWORD6, 12(rSTR2)
315 cmplw cr5, rWORD7, rWORD8
318 lwzu rWORD7, 16(rSTR1)
319 lwzu rWORD8, 16(rSTR2)
321 cmplw cr0, rWORD1, rWORD2
/* Loop epilogue: drain the compares still pending in the pipeline.  */
325 cmplw cr1, rWORD3, rWORD4
327 cmplw cr6, rWORD5, rWORD6
329 cmplw cr5, rWORD7, rWORD8
343 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
345 /* At this point we have a remainder of 1 to 3 bytes to compare. Since
346 we are aligned it is safe to load the whole word, and use
347 shift right to eliminate bits beyond the compare length. */
351 srw rWORD1, rWORD1, rN
352 srw rWORD2, rWORD2, rN
/* Byte-at-a-time path (short or oddly aligned operands); the frame is
   released here and CTR counts bytes.  */
401 cfi_adjust_cfa_offset(-64)
402 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
404 /* We need to prime this loop. This loop is swing modulo scheduled
405 to avoid pipe delays. The dependent instruction latencies (load to
406 compare to conditional branch) is 2 to 3 cycles. In this loop each
407 dispatch group ends in a branch and takes 1 cycle. Effectively
408 the first iteration of the loop only serves to load operands and
409 branches based on compares are delayed until the next loop.
411 So we must precondition some registers and condition codes so that
412 we don't exit the loop early on the first iteration. */
417 cmplw cr0, rWORD1, rWORD2
421 cmplw cr1, rWORD3, rWORD4
422 lbzu rWORD5, 2(rSTR1)
423 lbzu rWORD6, 2(rSTR2)
427 lbzu rWORD1, 1(rSTR1)
428 lbzu rWORD2, 1(rSTR2)
431 cmplw cr6, rWORD5, rWORD6
434 lbzu rWORD3, 1(rSTR1)
435 lbzu rWORD4, 1(rSTR2)
438 cmplw cr0, rWORD1, rWORD2
441 lbzu rWORD5, 1(rSTR1)
442 lbzu rWORD6, 1(rSTR2)
445 cmplw cr1, rWORD3, rWORD4
448 /* We are speculatively loading bytes before we have tested the previous
449 bytes. But we must avoid overrunning the length (in the ctr) to
450 prevent these speculative loads from causing a segfault. In this
451 case the loop will exit early (before all the pending bytes are
452 tested), and we must complete the pending operations on exit.  */
/* memcmp result: signed difference of the first mismatching bytes.
   rRTN is presumably an alias for r3 defined above -- TODO confirm.  */
489 sub rRTN, rWORD5, rWORD6
495 sub rRTN, rWORD3, rWORD4
500 sub rRTN, rWORD1, rWORD2
510 cfi_adjust_cfa_offset(64)
512 /* At this point we know the strings have different alignment and the
513 compare length is at least 8 bytes. rBITDIF contains the low order
514 2 bits of rSTR1 and cr5 contains the result of the logical compare
515 of rBITDIF to 0. If rBITDIF == 0 then rStr1 is word aligned and can
516 perform the Wunaligned loop.
518 Otherwise we know that rSTR1 is not already word aligned.
519 So we can force the string addresses to the next lower word
520 boundary and special case this first word using shift left to
521 eliminate bits preceding the first byte. Since we want to join the
522 normal (Wunaligned) compare loop, starting at the second word,
523 we need to adjust the length (rN) and special case the loop
524 versioning for the first W. This ensures that the loop count is
525 correct and the first W (shifted) is in the expected register pair. */
526 #define rSHL r29 /* Unaligned shift left count. */
527 #define rSHR r28 /* Unaligned shift right count. */
528 #define rB r27 /* Left rotation temp for rWORD2. */
529 #define rD r26 /* Left rotation temp for rWORD4. */
530 #define rF r25 /* Left rotation temp for rWORD6. */
531 #define rH r24 /* Left rotation temp for rWORD8. */
532 #define rA r0 /* Right rotation temp for rWORD2. */
533 #define rC r12 /* Right rotation temp for rWORD4. */
534 #define rE r0 /* Right rotation temp for rWORD6. */
535 #define rG r12 /* Right rotation temp for rWORD8. */
/* Non-volatile temps r24-r29 are saved in the frame (stores not
   visible here); record their slots for the unwinder.  */
538 cfi_offset(r29,(40-64))
/* rSHL = byte offset of rSTR2 within its word (0..3); presumably
   scaled to a bit count before use -- the scaling is not visible.  */
539 clrlwi rSHL, rSTR2, 30
541 cfi_offset(r28,(36-64))
542 beq cr5, L(Wunaligned)
544 cfi_offset(r27,(32-64))
545 /* Adjust the logical start of rSTR2 to compensate for the extra bits
546 in the 1st rSTR1 W. */
547 sub r27, rSTR2, rBITDIF
548 /* But do not attempt to address the W before the W that contains
549 the actual start of rSTR2. */
550 clrrwi rSTR2, rSTR2, 2
552 cfi_offset(r26,(28-64))
553 /* Compute the left/right shift counts for the unaligned rSTR2,
554 compensating for the logical (W aligned) start of rSTR1. */
556 clrrwi rSTR1, rSTR1, 2
558 cfi_offset(r25,(24-64))
560 cmplw cr5, r27, rSTR2
564 cfi_offset(r24,(20-64))
/* rSHR is the complementary right-shift count: 32 - rSHL.  */
565 subfic rSHR, rSHL, 32
566 srwi rTMP, rN, 4 /* Divide by 16 */
567 andi. rBITDIF, rN, 12 /* Get the W remainder */
568 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
569 this special case those bits may be discarded anyway. Also we
570 must avoid loading a W where none of the bits are part of rSTR2 as
571 this may cross a page boundary and cause a page fault. */
576 slw rWORD8, rWORD8, rSHL
581 cmplwi cr1, rBITDIF, 8
586 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
/* Merge the right-rotated bits of the next rSTR2 word (rG) into the
   left-shifted current word to reconstruct an aligned word.  */
587 or rWORD8, rG, rWORD8
595 slw rWORD7, rWORD1, r11
596 slw rWORD8, rWORD8, r11
598 /* At this point we exit early with the first word compare
599 complete and remainder of 0 to 3 bytes. See L(du14) for details on
600 how we handle the remaining bytes. */
601 cmplw cr5, rWORD7, rWORD8
615 slw rWORD5, rWORD1, r11
616 slw rWORD6, rWORD8, r11
618 /* Remainder is 12 */
622 slw rWORD3, rWORD1, r11
623 slw rWORD4, rWORD8, r11
625 /* Count is a multiple of 16, remainder is 0 */
628 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
629 or rWORD8, rG, rWORD8
631 slw rWORD1, rWORD1, r11
632 slw rWORD2, rWORD8, r11
635 /* At this point we know rSTR1 is word aligned and the
636 compare length is at least 8 bytes. */
640 cfi_offset(r27,(32-64))
641 clrrwi rSTR2, rSTR2, 2
643 cfi_offset(r26,(28-64))
644 srwi rTMP, rN, 4 /* Divide by 16 */
646 cfi_offset(r25,(24-64))
647 andi. rBITDIF, rN, 12 /* Get the W remainder */
/* Fixed: r24's save slot is frame offset 20, not 24 -- offset 24
   already belongs to r25 (see two lines up), and the other-alignment
   path above records r24 at (20-64).  */
649 cfi_offset(r24,(20-64))
652 lwzu rWORD8, 4(rSTR2)
653 cmplwi cr1, rBITDIF, 8
656 subfic rSHR, rSHL, 32
659 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
674 cmplw cr5, rWORD7, rWORD8
680 cmplw cr0, rWORD1, rWORD2
685 lwz rWORD5, 12(rSTR1)
686 lwz rWORD6, 12(rSTR2)
687 cmplw cr1, rWORD3, rWORD4
692 cmplw cr6, rWORD5, rWORD6
695 /* At this point we exit early with the first word compare
696 complete and remainder of 0 to 3 bytes. See L(du14) for details on
697 how we handle the remaining bytes. */
699 cmplw cr5, rWORD7, rWORD8
719 cmplw cr6, rWORD5, rWORD6
726 cmplw cr5, rWORD7, rWORD8
731 lwz rWORD3, 12(rSTR1)
732 lwz rWORD4, 12(rSTR2)
733 cmplw cr0, rWORD1, rWORD2
740 cmplw cr1, rWORD3, rWORD4
744 cmplw cr5, rWORD7, rWORD8
758 /* Remainder is 12 */
768 cmplw cr1, rWORD3, rWORD4
774 cmplw cr6, rWORD5, rWORD6
780 lwz rWORD1, 12(rSTR1)
781 lwz rWORD2, 12(rSTR2)
782 cmplw cr5, rWORD7, rWORD8
789 cmplw cr0, rWORD1, rWORD2
796 cmplw cr5, rWORD7, rWORD8
808 /* Count is a multiple of 16, remainder is 0 */
811 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
819 cmplw cr0, rWORD1, rWORD2
825 cmplw cr1, rWORD3, rWORD4
830 lwzu rWORD7, 12(rSTR1)
831 lwzu rWORD8, 12(rSTR2)
832 cmplw cr6, rWORD5, rWORD6
837 cmplw cr5, rWORD7, rWORD8
838 bdz- L(du24) /* Adjust CTR as we start with +4 */
839 /* This is the primary loop */
/* Unaligned main loop: same 4-word software pipeline as the aligned
   loop, with rSTR2 words reassembled from shifted word pairs.  */
844 cmplw cr1, rWORD3, rWORD4
852 cmplw cr6, rWORD5, rWORD6
858 lwz rWORD5, 12(rSTR1)
859 lwz rWORD6, 12(rSTR2)
860 cmplw cr5, rWORD7, rWORD8
866 lwzu rWORD7, 16(rSTR1)
867 lwzu rWORD8, 16(rSTR2)
868 cmplw cr0, rWORD1, rWORD2
877 cmplw cr1, rWORD3, rWORD4
879 cmplw cr6, rWORD5, rWORD6
881 cmplw cr5, rWORD7, rWORD8
891 /* At this point we have a remainder of 1 to 3 bytes to compare. We use
892 shift right to eliminate bits beyond the compare length.
894 However it may not be safe to load rWORD2 which may be beyond the
895 string length. So we compare the bit length of the remainder to
896 the right shift count (rSHR). If the bit count is less than or equal
897 we do not need to load rWORD2 (all significant bits are already in
   the merged word).  */
909 subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
913 srw rWORD1, rWORD1, rN
914 srw rWORD2, rWORD2, rN
/* Unaligned-path exits: L(dureturn29) presumably restores the saved
   non-volatile registers before returning -- label body not visible.  */
929 bgt cr0, L(dureturn29)
939 bgt cr1, L(dureturn29)
949 bgt cr6, L(dureturn29)
959 bgt cr5, L(dureturn29)
983 END (BP_SYM (memcmp))
/* Export the internal symbol and provide the traditional bcmp alias.  */
985 libc_hidden_builtin_def (memcmp)
986 weak_alias (memcmp, bcmp)