/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* int [r3] memcmp (const char *s1 [r3],
                    const char *s2 [r4],
                    size_t size [r5])  */
#define rRTN    r3      /* Return value register (shared with rSTR1).  */
#define rSTR1   r3      /* first string arg */
#define rSTR2   r4      /* second string arg */
#define rN      r5      /* max string length */
#define rWORD1  r6      /* current word in s1 */
#define rWORD2  r7      /* current word in s2 */
#define rWORD3  r8      /* next word in s1 */
#define rWORD4  r9      /* next word in s2 */
#define rWORD5  r10     /* next word in s1 */
#define rWORD6  r11     /* next word in s2 */
#define rWORD7  r30     /* next word in s1 */
#define rWORD8  r31     /* next word in s2 */
        beq- cr6, L(zeroLength)
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
        blt cr1, L(bytealigned)
        cfi_adjust_cfa_offset(64)
        cfi_offset(rWORD8, (48-64))
        cfi_offset(rWORD7, (44-64))
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
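/* Editorial aside -- a minimal C sketch of the first-word trick just
   described, not part of this file.  It assumes 32-bit big-endian
   words; p1, p2 and handle_difference are hypothetical names:

     unsigned off = (unsigned long) p1 & 3;   // low 2 bits, as in r12
     unsigned w1 = *(const unsigned *) ((unsigned long) p1 & ~3UL);
     unsigned w2 = *(const unsigned *) ((unsigned long) p2 & ~3UL);
     unsigned sh = off * 8;            // bits preceding the first byte
     if ((w1 << sh) != (w2 << sh))     // junk bytes shifted out high
       return handle_difference (w1 << sh, w2 << sh);
     // otherwise join the aligned loop, starting at the second word
*/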
        clrrwi rSTR1, rSTR1, 2
        clrrwi rSTR2, rSTR2, 2
        srwi r0, rN, 4          /* Divide by 16 */
        andi. r12, rN, 12       /* Get the word remainder */
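/* Editorial worked example: for rN = 27 the two instructions above
   yield r0 = 27 >> 4 = 1 full 16-byte (4-word) pass and
   r12 = 27 & 12 = 8, so the loop is entered through its
   8-byte-remainder version; the final 27 & 3 = 3 bytes are handled by
   the tail compare.  */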
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        slw rWORD5, rWORD1, rWORD6
        slw rWORD6, rWORD2, rWORD6
        cmplw cr5, rWORD5, rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
        slw rWORD5, rWORD1, rWORD6
        slw rWORD6, rWORD2, rWORD6
        cmplw cr6, rWORD5, rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        cmplw cr5, rWORD7, rWORD8
/* Remainder is 12 */
        slw rWORD3, rWORD1, rWORD6
        slw rWORD4, rWORD2, rWORD6
        cmplw cr1, rWORD3, rWORD4
/* Count is a multiple of 16, remainder is 0 */
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        slw rWORD1, rWORD1, rWORD6
        slw rWORD2, rWORD2, rWORD6
        cmplw cr7, rWORD1, rWORD2
/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
        andi. r12, rN, 12       /* Get the word remainder */
        srwi r0, rN, 4          /* Divide by 16 */
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
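/* (Editorial note: under the 32-bit PowerPC SVR4 ABI, GPRs r0 and
   r3-r12 are volatile while r14-r31 must be preserved, and CR fields
   cr0, cr1 and cr5-cr7 are volatile while cr2-cr4 are not.  rWORD7 and
   rWORD8 are r30/r31, which is why touching them would force the
   save/restore this path avoids.)  */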
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        cmplw cr5, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        subfic rN, r12, 32      /* Shift count is 32 - (rN * 8).  */
        cfi_adjust_cfa_offset(-64)
        cfi_adjust_cfa_offset(64)
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        lwz rWORD3, 12(rSTR1)
        lwz rWORD4, 12(rSTR2)
        cmplw cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
/* Again we are on an early exit path (16-23 byte compare); we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr1, rWORD3, rWORD4
#ifndef __LITTLE_ENDIAN__
        subfic rN, r12, 32      /* Shift count is 32 - (rN * 8).  */
        cfi_adjust_cfa_offset(-64)
/* Remainder is 12 */
        cfi_adjust_cfa_offset(64)
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        lwz rWORD1, 12(rSTR1)
        lwz rWORD2, 12(rSTR2)
        cmplw cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
/* Again we are on an early exit path (24-31 byte compare); we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
        subfic rN, r12, 32      /* Shift count is 32 - (rN * 8).  */
        cfi_adjust_cfa_offset(-64)
/* Count is a multiple of 16, remainder is 0 */
        cfi_adjust_cfa_offset(64)
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        lwzu rWORD7, 12(rSTR1)
        lwzu rWORD8, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        bdz- L(d24)     /* Adjust CTR as we start with +4 */
/* This is the primary loop */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr1, rWORD3, rWORD4
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr6, rWORD5, rWORD6
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        cmplw cr7, rWORD1, rWORD2
        cmplw cr1, rWORD3, rWORD4
        cmplw cr6, rWORD5, rWORD6
        cmplw cr5, rWORD7, rWORD8
        cfi_adjust_cfa_offset(-64)
        subfic rN, r12, 32      /* Shift count is 32 - (rN * 8).  */
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        srw rWORD1, rWORD1, rN
        srw rWORD2, rWORD2, rN
        sub rRTN, rWORD1, rWORD2
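/* Editorial aside -- a minimal C sketch of the shift-right tail just
   above, not part of this file.  Assumes big-endian words, a 1-3 byte
   remainder rem, and that w1/w2 hold the safely loaded final words:

     unsigned sh = 32 - rem * 8;      // bits beyond the compare length
     return (w1 >> sh) - (w2 >> sh);  // word subtract gives the sign
*/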
        cfi_adjust_cfa_offset(64)
        cfi_adjust_cfa_offset(-64)
        cfi_adjust_cfa_offset(64)
        cfi_adjust_cfa_offset(-64)
        cfi_adjust_cfa_offset(64)
        cfi_adjust_cfa_offset(-64)
        cfi_adjust_cfa_offset(64)
        cfi_adjust_cfa_offset(-64)
        mtctr rN        /* Power4 wants mtctr 1st in dispatch group */
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
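/* Editorial aside -- the same software pipelining in minimal C, not
   part of this file.  It is only one stage deep here, while the asm
   below rotates three register pairs to cover the 2-3 cycle
   load-to-branch latency:

     unsigned char a = p1[0], b = p2[0];      // prime: load, no test
     for (size_t i = 1; i < n; i++) {
       unsigned char a2 = p1[i], b2 = p2[i];  // issue next loads early
       if (a != b) break;             // ...branch on the previous pair
       a = a2; b = b2;
     }
     return a - b;   // the last loaded pair is still pending a test
*/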
        cmplw cr7, rWORD1, rWORD2
        cmplw cr1, rWORD3, rWORD4
        lbzu rWORD5, 2(rSTR1)
        lbzu rWORD6, 2(rSTR2)
        lbzu rWORD1, 1(rSTR1)
        lbzu rWORD2, 1(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        lbzu rWORD3, 1(rSTR1)
        lbzu rWORD4, 1(rSTR2)
        cmplw cr7, rWORD1, rWORD2
        lbzu rWORD5, 1(rSTR1)
        lbzu rWORD6, 1(rSTR2)
        cmplw cr1, rWORD3, rWORD4
/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all the pending bytes are
   tested).  In that case we must complete the pending operations
   before returning.  */
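/* (Editorial example: with ctr bounding the trip count, the look-ahead
   loads above are kept from running past rSTR1+rN/rSTR2+rN and so
   cannot fault; when the counter expires, the compares for the last
   loaded byte pairs are still pending and are resolved by the tail
   code that follows.)  */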
        sub rRTN, rWORD5, rWORD6
        sub rRTN, rWORD3, rWORD4
        sub rRTN, rWORD1, rWORD2
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and we can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not yet word aligned.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
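/* Editorial aside -- a minimal C sketch of the unaligned merge used
   throughout this path, not part of this file.  Assumes big-endian
   words and bit counts satisfying rSHL + rSHR == 32:

     unsigned cur  = s2_words[i];      // word with the leading bytes
     unsigned next = s2_words[i + 1];  // word with the trailing bytes
     unsigned w2   = (cur << rSHL) | (next >> rSHR);
     // w2 now lines up with the word loaded from the aligned rSTR1
*/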
#define rSHL    r29     /* Unaligned shift left count.  */
#define rSHR    r28     /* Unaligned shift right count.  */
#define rWORD8_SHIFT    r27     /* Left rotation temp for rWORD2.  */
#define rWORD2_SHIFT    r26     /* Left rotation temp for rWORD4.  */
#define rWORD4_SHIFT    r25     /* Left rotation temp for rWORD6.  */
#define rWORD6_SHIFT    r24     /* Left rotation temp for rWORD8.  */
        cfi_adjust_cfa_offset(64)
        cfi_offset(rSHL, (40-64))
        clrlwi rSHL, rSTR2, 30
        cfi_offset(rSHR, (36-64))
        beq cr5, L(Wunaligned)
        stw rWORD8_SHIFT, 32(r1)
        cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
        sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before the one that contains
   the actual start of rSTR2.  */
        clrrwi rSTR2, rSTR2, 2
        stw rWORD2_SHIFT, 28(r1)
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
        clrlwi rSHL, rWORD8_SHIFT, 30
        clrrwi rSTR1, rSTR1, 2
        stw rWORD4_SHIFT, 24(r1)
        cmplw cr5, rWORD8_SHIFT, rSTR2
        stw rWORD6_SHIFT, 20(r1)
        cfi_offset(rWORD2_SHIFT, (28-64))
        cfi_offset(rWORD4_SHIFT, (24-64))
        cfi_offset(rWORD6_SHIFT, (20-64))
        subfic rSHR, rSHL, 32
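/* (Editorial worked example: rSTR1 & 3 == 2 and rSTR2 & 3 == 1 give a
   logical rSTR2 start of rSTR2 - 2, whose low 2 bits are 3; converted
   to a bit count that is rSHL = 3 * 8 = 24, so rSHR = 32 - 24 = 8 and
   each merged word is (cur << 24) | (next >> 8).)  */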
        srwi r0, rN, 4          /* Divide by 16 */
        andi. r12, rN, 12       /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
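/* Editorial aside -- the guard just described in minimal C, not part
   of this file; word_holds_s2_bytes and s2_word are hypothetical names:

     unsigned w_prev;
     if (word_holds_s2_bytes)   // any bits of rSTR2 in this word?
       w_prev = *s2_word;       // safe: the load stays inside the string
     else
       w_prev = 0;              // these bits would be shifted out anyway,
                                // and the load could fault across a page
*/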
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD8, 0, rSTR2
        slw rWORD8, rWORD8, rSHL
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        srw r12, rWORD2, rSHR
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        or rWORD8, r12, rWORD8
        slw rWORD8_SHIFT, rWORD2, rSHL
        slw rWORD7, rWORD1, rWORD6
        slw rWORD8, rWORD8, rWORD6
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD2, 0, rSTR2
        slw rWORD6_SHIFT, rWORD2, rSHL
        slw rWORD5, rWORD1, rWORD6
        slw rWORD6, rWORD8, rWORD6
/* Remainder is 12 */
        slw rWORD4_SHIFT, rWORD2, rSHL
        slw rWORD3, rWORD1, rWORD6
        slw rWORD4, rWORD8, rWORD6
/* Count is a multiple of 16, remainder is 0 */
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        or rWORD8, r12, rWORD8
        slw rWORD2_SHIFT, rWORD2, rSHL
        slw rWORD1, rWORD1, rWORD6
        slw rWORD2, rWORD8, rWORD6
/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
        stw rWORD8_SHIFT, 32(r1)
        clrrwi rSTR2, rSTR2, 2
        stw rWORD2_SHIFT, 28(r1)
        srwi r0, rN, 4          /* Divide by 16 */
        stw rWORD4_SHIFT, 24(r1)
        andi. r12, rN, 12       /* Get the W remainder */
        stw rWORD6_SHIFT, 20(r1)
        cfi_offset(rWORD8_SHIFT, (32-64))
        cfi_offset(rWORD2_SHIFT, (28-64))
        cfi_offset(rWORD4_SHIFT, (24-64))
        cfi_offset(rWORD6_SHIFT, (20-64))
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD6, 0, rSTR2
        lwbrx rWORD8, 0, rSTR2
        lwzu rWORD8, 4(rSTR2)
        subfic rSHR, rSHL, 32
        slw rWORD6_SHIFT, rWORD6, rSHL
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        srw r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        slw rWORD8_SHIFT, rWORD8, rSHL
        or rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr5, rWORD7, rWORD8
        slw rWORD2_SHIFT, rWORD2, rSHL
        or rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        cmplw cr7, rWORD1, rWORD2
        srw r12, rWORD4, rSHR
        slw rWORD4_SHIFT, rWORD4, rSHL
        or rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        slw rWORD6_SHIFT, rWORD6, rSHL
        or rWORD6, r0, rWORD4_SHIFT
        cmplw cr6, rWORD5, rWORD6
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD2, 0, rSTR2
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        or rWORD6, r0, rWORD6_SHIFT
        slw rWORD6_SHIFT, rWORD8, rSHL
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        cmplw cr6, rWORD5, rWORD6
        srw r12, rWORD8, rSHR
        slw rWORD8_SHIFT, rWORD8, rSHL
        or rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        cmplw cr5, rWORD7, rWORD8
        srw r0, rWORD2, rSHR
        slw rWORD2_SHIFT, rWORD2, rSHL
        or rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD3, 12(rSTR1)
        lwz rWORD4, 12(rSTR2)
        cmplw cr7, rWORD1, rWORD2
        srw r12, rWORD4, rSHR
        slw rWORD4_SHIFT, rWORD4, rSHL
        or rWORD4, r12, rWORD2_SHIFT
#ifndef __LITTLE_ENDIAN__
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        cmplw cr1, rWORD3, rWORD4
        cmplw cr5, rWORD7, rWORD8
#ifndef __LITTLE_ENDIAN__
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD2, 0, rSTR2
        addi rSTR2, rSTR2, 4
        lwz rWORD2, 4(rSTR2)
        srw r0, rWORD2, rSHR
/* Remainder is 12 */
        srw r12, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        addi rSTR1, rSTR1, 4
        lwz rWORD3, 0(rSTR1)
        slw rWORD4_SHIFT, rWORD8, rSHL
        or rWORD4, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD5, 4(rSTR1)
        lwz rWORD6, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        srw r0, rWORD6, rSHR
        slw rWORD6_SHIFT, rWORD6, rSHL
        or rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD7, 8(rSTR1)
        lwz rWORD8, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        srw r12, rWORD8, rSHR
        slw rWORD8_SHIFT, rWORD8, rSHL
        or rWORD8, r12, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD1, 12(rSTR1)
        lwz rWORD2, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        srw r0, rWORD2, rSHR
        slw rWORD2_SHIFT, rWORD2, rSHL
        or rWORD2, r0, rWORD8_SHIFT
#ifndef __LITTLE_ENDIAN__
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
        cmplw cr7, rWORD1, rWORD2
#ifndef __LITTLE_ENDIAN__
        addi rSTR1, rSTR1, 8
        addi rSTR2, rSTR2, 8
/* Huh?  We've already branched on cr1!  */
        cmplw cr5, rWORD7, rWORD8
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD2, 0, rSTR2
        addi rSTR2, rSTR2, 4
        lwz rWORD2, 4(rSTR2)
        srw r0, rWORD2, rSHR
/* Count is a multiple of 16, remainder is 0 */
        mtctr r0        /* Power4 wants mtctr 1st in dispatch group */
        srw r0, rWORD8, rSHR
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        addi rSTR1, rSTR1, 4
        lwz rWORD1, 0(rSTR1)
        slw rWORD2_SHIFT, rWORD8, rSHL
        or rWORD2, r0, rWORD6_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD3, 4(rSTR1)
        lwz rWORD4, 4(rSTR2)
        cmplw cr7, rWORD1, rWORD2
        srw r12, rWORD4, rSHR
        slw rWORD4_SHIFT, rWORD4, rSHL
        or rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD5, 8(rSTR1)
        lwz rWORD6, 8(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        srw r0, rWORD6, rSHR
        slw rWORD6_SHIFT, rWORD6, rSHL
        or rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwzu rWORD7, 12(rSTR1)
        lwzu rWORD8, 12(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        srw r12, rWORD8, rSHR
        slw rWORD8_SHIFT, rWORD8, rSHL
        or rWORD8, r12, rWORD6_SHIFT
        cmplw cr5, rWORD7, rWORD8
        bdz- L(du24)    /* Adjust CTR as we start with +4 */
/* This is the primary loop */
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwbrx rWORD2, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD1, 4(rSTR1)
        lwz rWORD2, 4(rSTR2)
        cmplw cr1, rWORD3, rWORD4
        srw r0, rWORD2, rSHR
        slw rWORD2_SHIFT, rWORD2, rSHL
        or rWORD2, r0, rWORD8_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD3, 0, rSTR1
        lwbrx rWORD4, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD3, 8(rSTR1)
        lwz rWORD4, 8(rSTR2)
        cmplw cr6, rWORD5, rWORD6
        srw r12, rWORD4, rSHR
        slw rWORD4_SHIFT, rWORD4, rSHL
        or rWORD4, r12, rWORD2_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD5, 0, rSTR1
        lwbrx rWORD6, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwz rWORD5, 12(rSTR1)
        lwz rWORD6, 12(rSTR2)
        cmplw cr5, rWORD7, rWORD8
        srw r0, rWORD6, rSHR
        slw rWORD6_SHIFT, rWORD6, rSHL
        or rWORD6, r0, rWORD4_SHIFT
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD7, 0, rSTR1
        lwbrx rWORD8, 0, rSTR2
        addi rSTR1, rSTR1, 4
        addi rSTR2, rSTR2, 4
        lwzu rWORD7, 16(rSTR1)
        lwzu rWORD8, 16(rSTR2)
        cmplw cr7, rWORD1, rWORD2
        srw r12, rWORD8, rSHR
        slw rWORD8_SHIFT, rWORD8, rSHL
        or rWORD8, r12, rWORD6_SHIFT
/* Huh?  We've already branched on cr1!  */
        cmplw cr1, rWORD3, rWORD4
        cmplw cr6, rWORD5, rWORD6
        cmplw cr5, rWORD7, rWORD8
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.
   This allows the use of word subtract to compute the final result.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
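/* Editorial aside -- the decision above in minimal C, not part of this
   file.  rem is the 1-3 byte remainder, carried holds the bits left in
   rWORD8_SHIFT from the last merge; other names are hypothetical:

     unsigned w2 = carried;            // bits already in rWORD8_SHIFT
     if (rem * 8 > rSHR)               // need bits beyond the carry?
       w2 |= *next_s2_word >> rSHR;    // load only when required; the
                                       // word may lie past the string
     unsigned sh = 32 - rem * 8;
     return (w1 >> sh) - (w2 >> sh);
*/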
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD2, 0, rSTR2
        addi rSTR2, rSTR2, 4
        lwz rWORD2, 4(rSTR2)
        srw r0, rWORD2, rSHR
#ifdef __LITTLE_ENDIAN__
        lwbrx rWORD1, 0, rSTR1
        lwz rWORD1, 4(rSTR1)
        subfic rN, rN, 32       /* Shift count is 32 - (rN * 8).  */
        or rWORD2, r0, rWORD8_SHIFT
        srw rWORD1, rWORD1, rN
        srw rWORD2, rWORD2, rN
        lwz rWORD8_SHIFT, 32(r1)
        sub rRTN, rWORD1, rWORD2
        bgt cr7, L(dureturn29)
        bgt cr1, L(dureturn29)
        bgt cr6, L(dureturn29)
        bgt cr5, L(dureturn29)
        lwz rWORD8_SHIFT, 32(r1)
        lwz rWORD2_SHIFT, 28(r1)
        lwz rWORD4_SHIFT, 24(r1)
        lwz rWORD6_SHIFT, 20(r1)
        cfi_adjust_cfa_offset(-64)
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)