1 /* Optimized memcmp implementation for PowerPC32.
2 Copyright (C) 2003, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
18 02110-1301, USA.  */
24 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */

/* NOTE(review): the sequence numbers embedded at the start of each line are
   not contiguous, so this copy of the file is missing lines (labels, loads,
   branches).  Comments below describe only what is visible here; confirm
   details against the complete upstream source.  */

26 EALIGN (BP_SYM(memcmp), 4, 0)

/* Register roles for the word-compare paths.  rWORD7/rWORD8 live in
   non-volatile r30/r31 and are only used once those are saved.  */
31 #define rSTR1 r3 /* first string arg */
32 #define rSTR2 r4 /* second string arg */
33 #define rN r5 /* max string length */
34 #define rWORD1 r6 /* current word in s1 */
35 #define rWORD2 r7 /* current word in s2 */
36 #define rWORD3 r8 /* next word in s1 */
37 #define rWORD4 r9 /* next word in s2 */
38 #define rWORD5 r10 /* next word in s1 */
39 #define rWORD6 r11 /* next word in s2 */
40 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
41 #define rWORD7 r30 /* next word in s1 */
42 #define rWORD8 r31 /* next word in s2 */

/* rTMP = s1 ^ s2; after clearing the upper 30 bits, the result is
   nonzero iff the two pointers differ in their byte offset within a
   word (i.e. the strings are mutually unaligned).  */
44 xor rTMP, rSTR2, rSTR1
47 clrlwi. rTMP, rTMP, 30
/* rBITDIF = low-order 2 bits of s1 (byte offset within its word);
   cr5 records whether it is zero (already word aligned).  */
48 clrlwi rBITDIF, rSTR1, 30
49 cmplwi cr5, rBITDIF, 0
/* cr6 is set by a compare not visible in this excerpt (presumably
   rN against 0) — confirm against the full source.  */
50 beq- cr6, L(zeroLength)
53 /* If less than 8 bytes or not aligned, use the unaligned byte loop.  */
55 blt cr1, L(bytealigned)
/* Non-volatile r30/r31 are saved by stores not shown in this excerpt;
   these CFI notes describe that save area (64-byte frame).  */
57 cfi_adjust_cfa_offset(64)
59 cfi_offset(31,(48-64))
61 cfi_offset(30,(44-64))
63 /* At this point we know both strings have the same alignment and the
64 compare length is at least 8 bytes. rBITDIF contains the low order
65 2 bits of rSTR1 and cr5 contains the result of the logical compare
66 of rBITDIF to 0. If rBITDIF == 0 then we are already word
67 aligned and can perform the word aligned loop.
69 Otherwise we know the two strings have the same alignment (but not
70 yet word aligned). So we force the string addresses to the next lower
71 word boundary and special case this first word using shift left to
72 eliminate bits preceding the first byte. Since we want to join the
73 normal (word aligned) compare loop, starting at the second word,
74 we need to adjust the length (rN) and special case the loop
75 versioning for the first word. This ensures that the loop count is
76 correct and the first word (shifted) is in the expected register pair. */
/* Force both string pointers down to the enclosing word boundary.  */
79 clrrwi rSTR1, rSTR1, 2
80 clrrwi rSTR2, rSTR2, 2
84 srwi rTMP, rN, 4 /* Divide by 16 */
85 andi. rBITDIF, rN, 12 /* Get the word remainder */
88 cmplwi cr1, rBITDIF, 8
/* Remainder-specific preconditioning of the first (shifted) word pair.
   r11 holds the left-shift count that discards bytes preceding the
   true start of the strings; it is computed on a line not visible in
   this excerpt.  Each remainder case picks a distinct register pair
   and CR field so it can fall into the main loop at the right phase.  */
92 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
99 slw rWORD5, rWORD1, r11
100 slw rWORD6, rWORD2, r11
101 cmplw cr5, rWORD5, rWORD6
103 /* Do something useful in this cycle since we have to branch anyway. */
106 cmplw cr0, rWORD1, rWORD2
111 slw rWORD5, rWORD1, r11
112 slw rWORD6, rWORD2, r11
113 cmplw cr6, rWORD5, rWORD6
115 /* Do something useful in this cycle since we have to branch anyway. */
118 cmplw cr5, rWORD7, rWORD8
120 /* Remainder is 12 */
123 slw rWORD3, rWORD1, r11
124 slw rWORD4, rWORD2, r11
125 cmplw cr1, rWORD3, rWORD4
127 /* Count is a multiple of 16, remainder is 0 */
130 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
131 slw rWORD1, rWORD1, r11
132 slw rWORD2, rWORD2, r11
133 cmplw cr0, rWORD1, rWORD2
136 /* At this point we know both strings are word aligned and the
137 compare length is at least 8 bytes. */
140 andi. rBITDIF, rN, 12 /* Get the word remainder */
141 srwi rTMP, rN, 4 /* Divide by 16 */
142 cmplwi cr1, rBITDIF, 8
152 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
153 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
154 (8-15 byte compare), we want to use only volatile registers. This
155 means we can avoid restoring non-volatile registers since we did not
156 change any on the early exit path. The key here is the non-early
157 exit path only cares about the condition code (cr5), not about which
158 register pair was used. */
161 cmplw cr5, rWORD5, rWORD6
165 cmplw cr0, rWORD1, rWORD2
169 cmplw cr1, rWORD3, rWORD4
170 lwz rWORD5, 12(rSTR1)
171 lwz rWORD6, 12(rSTR2)
172 cmplw cr6, rWORD5, rWORD6
/* lwzu advances both string pointers by 16 as it loads the next pair.  */
176 lwzu rWORD7, 16(rSTR1)
177 lwzu rWORD8, 16(rSTR2)
179 cmplw cr5, rWORD7, rWORD8
/* Convert the remaining byte count (scaled to bits elsewhere, in r12)
   into a right-shift count for the final partial word.  */
188 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
197 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
200 cmplw cr6, rWORD5, rWORD6
204 cmplw cr5, rWORD7, rWORD8
208 cmplw cr0, rWORD1, rWORD2
209 lwz rWORD3, 12(rSTR1)
210 lwz rWORD4, 12(rSTR2)
211 cmplw cr1, rWORD3, rWORD4
217 /* Again we are on an early exit path (16-23 byte compare), we want to
218 only use volatile registers and avoid restoring non-volatile registers.  */
224 cmplw cr5, rWORD3, rWORD4
230 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
236 /* Remainder is 12 */
239 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
242 cmplw cr1, rWORD3, rWORD4
246 cmplw cr6, rWORD5, rWORD6
250 cmplw cr5, rWORD7, rWORD8
251 lwz rWORD1, 12(rSTR1)
252 lwz rWORD2, 12(rSTR2)
253 cmplw cr0, rWORD1, rWORD2
259 /* Again we are on an early exit path (24-31 byte compare), we want to
260 only use volatile registers and avoid restoring non-volatile registers.  */
266 cmplw cr5, rWORD1, rWORD2
272 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
279 /* Count is a multiple of 16, remainder is 0 */
282 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
285 cmplw cr0, rWORD1, rWORD2
289 cmplw cr1, rWORD3, rWORD4
292 cmplw cr6, rWORD5, rWORD6
293 lwzu rWORD7, 12(rSTR1)
294 lwzu rWORD8, 12(rSTR2)
295 cmplw cr5, rWORD7, rWORD8
298 bdz- L(d24) /* Adjust CTR as we start with +4 */
299 /* This is the primary loop */
/* Four word-pairs are in flight per iteration; each pair uses its own
   CR field (cr0/cr1/cr6/cr5) so the compares and the (elided here)
   conditional branches can overlap.  */
304 cmplw cr1, rWORD3, rWORD4
309 cmplw cr6, rWORD5, rWORD6
312 lwz rWORD5, 12(rSTR1)
313 lwz rWORD6, 12(rSTR2)
314 cmplw cr5, rWORD7, rWORD8
317 lwzu rWORD7, 16(rSTR1)
318 lwzu rWORD8, 16(rSTR2)
320 cmplw cr0, rWORD1, rWORD2
324 cmplw cr1, rWORD3, rWORD4
326 cmplw cr6, rWORD5, rWORD6
328 cmplw cr5, rWORD7, rWORD8
342 subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
344 /* At this point we have a remainder of 1 to 3 bytes to compare. Since
345 we are aligned it is safe to load the whole word, and use
346 shift right to eliminate bits beyond the compare length. */
350 srw rWORD1, rWORD1, rN
351 srw rWORD2, rWORD2, rN
400 cfi_adjust_cfa_offset(-64)
/* Byte-at-a-time path: CTR counts the bytes to compare.  */
401 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
403 /* We need to prime this loop. This loop is swing modulo scheduled
404 to avoid pipe delays. The dependent instruction latencies (load to
405 compare to conditional branch) is 2 to 3 cycles. In this loop each
406 dispatch group ends in a branch and takes 1 cycle. Effectively
407 the first iteration of the loop only serves to load operands and
408 branches based on compares are delayed until the next loop.
410 So we must precondition some registers and condition codes so that
411 we don't exit the loop early on the first iteration. */
416 cmplw cr0, rWORD1, rWORD2
420 cmplw cr1, rWORD3, rWORD4
421 lbzu rWORD5, 2(rSTR1)
422 lbzu rWORD6, 2(rSTR2)
/* Loop body: three byte-pairs in flight, one per CR field.  */
426 lbzu rWORD1, 1(rSTR1)
427 lbzu rWORD2, 1(rSTR2)
430 cmplw cr6, rWORD5, rWORD6
433 lbzu rWORD3, 1(rSTR1)
434 lbzu rWORD4, 1(rSTR2)
437 cmplw cr0, rWORD1, rWORD2
440 lbzu rWORD5, 1(rSTR1)
441 lbzu rWORD6, 1(rSTR2)
444 cmplw cr1, rWORD3, rWORD4
447 /* We are speculatively loading bytes before we have tested the previous
448 bytes. But we must avoid overrunning the length (in the ctr) to
449 prevent these speculative loads from causing a segfault. In this
450 case the loop will exit early (before all the pending bytes are
451 tested). In this case we must complete the pending operations.  */
/* Mismatch exits: return sign of the first differing byte pair.  */
488 sub rRTN, rWORD5, rWORD6
494 sub rRTN, rWORD3, rWORD4
499 sub rRTN, rWORD1, rWORD2
509 cfi_adjust_cfa_offset(64)
511 /* At this point we know the strings have different alignment and the
512 compare length is at least 8 bytes. rBITDIF contains the low order
513 2 bits of rSTR1 and cr5 contains the result of the logical compare
514 of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is word aligned and can
515 perform the Wunaligned loop.
517 Otherwise we know that rSTR1 is not already word aligned yet.
518 So we can force the string addresses to the next lower word
519 boundary and special case this first word using shift left to
520 eliminate bits preceding the first byte. Since we want to join the
521 normal (Wunaligned) compare loop, starting at the second word,
522 we need to adjust the length (rN) and special case the loop
523 versioning for the first W. This ensures that the loop count is
524 correct and the first W (shifted) is in the expected register pair. */
/* Extra non-volatile registers for the unaligned path (saves/restores
   are on lines not visible in this excerpt).  Note rA/rE and rC/rG
   deliberately alias r0 and r12 — the rotation temps are short-lived.  */
525 #define rSHL r29 /* Unaligned shift left count. */
526 #define rSHR r28 /* Unaligned shift right count. */
527 #define rB r27 /* Left rotation temp for rWORD2. */
528 #define rD r26 /* Left rotation temp for rWORD4. */
529 #define rF r25 /* Left rotation temp for rWORD6. */
530 #define rH r24 /* Left rotation temp for rWORD8. */
531 #define rA r0 /* Right rotation temp for rWORD2. */
532 #define rC r12 /* Right rotation temp for rWORD4. */
533 #define rE r0 /* Right rotation temp for rWORD6. */
534 #define rG r12 /* Right rotation temp for rWORD8. */
537 cfi_offset(r29,(40-64))
/* rSHL = byte offset of rSTR2 within its word; a scaling to a bit
   count (x8) presumably happens on a line not visible here, since
   subfic below treats it as bits — confirm against the full source.  */
538 clrlwi rSHL, rSTR2, 30
540 cfi_offset(r28,(36-64))
541 beq cr5, L(Wunaligned)
543 cfi_offset(r27,(32-64))
544 /* Adjust the logical start of rSTR2 to compensate for the extra bits
545 in the 1st rSTR1 W. */
546 sub r27, rSTR2, rBITDIF
547 /* But do not attempt to address the W before that W that contains
548 the actual start of rSTR2. */
549 clrrwi rSTR2, rSTR2, 2
551 cfi_offset(r26,(28-64))
552 /* Compute the left/right shift counts for the unaligned rSTR2,
553 compensating for the logical (W aligned) start of rSTR1. */
555 clrrwi rSTR1, rSTR1, 2
557 cfi_offset(r25,(24-64))
559 cmplw cr5, r27, rSTR2
563 cfi_offset(r24,(20-64))
564 subfic rSHR, rSHL, 32
565 srwi rTMP, rN, 4 /* Divide by 16 */
566 andi. rBITDIF, rN, 12 /* Get the W remainder */
567 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
568 this special case those bits may be discarded anyway. Also we
569 must avoid loading a W where none of the bits are part of rSTR2 as
570 this may cross a page boundary and cause a page fault. */
575 slw rWORD8, rWORD8, rSHL
580 cmplwi cr1, rBITDIF, 8
585 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
/* Merge the right-rotated bits (rG) of the next rSTR2 word into the
   left-shifted current word to rebuild a full aligned word.  */
586 or rWORD8, rG, rWORD8
594 slw rWORD7, rWORD1, r11
595 slw rWORD8, rWORD8, r11
597 /* At this point we exit early with the first word compare
598 complete and remainder of 0 to 3 bytes. See L(du14) for details on
599 how we handle the remaining bytes. */
600 cmplw cr5, rWORD7, rWORD8
614 slw rWORD5, rWORD1, r11
615 slw rWORD6, rWORD8, r11
617 /* Remainder is 12 */
621 slw rWORD3, rWORD1, r11
622 slw rWORD4, rWORD8, r11
624 /* Count is a multiple of 16, remainder is 0 */
627 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
628 or rWORD8, rG, rWORD8
630 slw rWORD1, rWORD1, r11
631 slw rWORD2, rWORD8, r11
634 /* At this point we know rSTR1 is word aligned and the
635 compare length is at least 8 bytes. */
639 cfi_offset(r27,(32-64))
640 clrrwi rSTR2, rSTR2, 2
642 cfi_offset(r26,(28-64))
643 srwi rTMP, rN, 4 /* Divide by 16 */
645 cfi_offset(r25,(24-64))
646 andi. rBITDIF, rN, 12 /* Get the W remainder */
/* NOTE(review): r24's save slot is recorded as (20-64) earlier in this
   file; (24-64) here duplicates r25's slot and is probably a typo —
   confirm against the upstream source.  */
648 cfi_offset(r24,(24-64))
651 lwzu rWORD8, 4(rSTR2)
652 cmplwi cr1, rBITDIF, 8
655 subfic rSHR, rSHL, 32
658 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
673 cmplw cr5, rWORD7, rWORD8
679 cmplw cr0, rWORD1, rWORD2
684 lwz rWORD5, 12(rSTR1)
685 lwz rWORD6, 12(rSTR2)
686 cmplw cr1, rWORD3, rWORD4
691 cmplw cr6, rWORD5, rWORD6
694 /* At this point we exit early with the first word compare
695 complete and remainder of 0 to 3 bytes. See L(du14) for details on
696 how we handle the remaining bytes. */
698 cmplw cr5, rWORD7, rWORD8
/* Remainder-8 case: same pattern, different register/CR pairing.  */
718 cmplw cr6, rWORD5, rWORD6
725 cmplw cr5, rWORD7, rWORD8
730 lwz rWORD3, 12(rSTR1)
731 lwz rWORD4, 12(rSTR2)
732 cmplw cr0, rWORD1, rWORD2
739 cmplw cr1, rWORD3, rWORD4
743 cmplw cr5, rWORD7, rWORD8
757 /* Remainder is 12 */
767 cmplw cr1, rWORD3, rWORD4
773 cmplw cr6, rWORD5, rWORD6
779 lwz rWORD1, 12(rSTR1)
780 lwz rWORD2, 12(rSTR2)
781 cmplw cr5, rWORD7, rWORD8
788 cmplw cr0, rWORD1, rWORD2
795 cmplw cr5, rWORD7, rWORD8
807 /* Count is a multiple of 16, remainder is 0 */
810 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
818 cmplw cr0, rWORD1, rWORD2
824 cmplw cr1, rWORD3, rWORD4
829 lwzu rWORD7, 12(rSTR1)
830 lwzu rWORD8, 12(rSTR2)
831 cmplw cr6, rWORD5, rWORD6
836 cmplw cr5, rWORD7, rWORD8
837 bdz- L(du24) /* Adjust CTR as we start with +4 */
838 /* This is the primary loop */
/* Unaligned main loop: like the aligned loop (4 word-pairs per
   iteration, one CR field each), but each rSTR2 word is rebuilt from
   two loads via the rotate/or sequences elided from this excerpt.  */
843 cmplw cr1, rWORD3, rWORD4
851 cmplw cr6, rWORD5, rWORD6
857 lwz rWORD5, 12(rSTR1)
858 lwz rWORD6, 12(rSTR2)
859 cmplw cr5, rWORD7, rWORD8
865 lwzu rWORD7, 16(rSTR1)
866 lwzu rWORD8, 16(rSTR2)
867 cmplw cr0, rWORD1, rWORD2
876 cmplw cr1, rWORD3, rWORD4
878 cmplw cr6, rWORD5, rWORD6
880 cmplw cr5, rWORD7, rWORD8
890 /* At this point we have a remainder of 1 to 3 bytes to compare. We use
891 shift right to eliminate bits beyond the compare length.
893 However it may not be safe to load rWORD2 which may be beyond the
894 string length. So we compare the bit length of the remainder to
895 the right shift count (rSHR). If the bit count is less than or equal
896 we do not need to load rWORD2 (all significant bits are already in
897 rWORD1).  */
908 subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
912 srw rWORD1, rWORD1, rN
913 srw rWORD2, rWORD2, rN
/* Mismatch found in an earlier word: branch to the restore/return
   stub; which CR field fired identifies the offending pair.  */
928 bgt cr0, L(dureturn29)
938 bgt cr1, L(dureturn29)
948 bgt cr6, L(dureturn29)
958 bgt cr5, L(dureturn29)
982 END (BP_SYM (memcmp))

/* bcmp is a historical alias for memcmp.  */
984 libc_hidden_builtin_def (memcmp)
985 weak_alias (memcmp, bcmp)