1 /* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
21 /* int [r3] memcmp (const char *s1 [r3],
30 #define rSTR1 r3 /* first string arg */
31 #define rSTR2 r4 /* second string arg */
32 #define rN r5 /* max string length */
33 #define rWORD1 r6 /* current word in s1 */
34 #define rWORD2 r7 /* current word in s2 */
35 #define rWORD3 r8 /* next word in s1 */
36 #define rWORD4 r9 /* next word in s2 */
37 #define rWORD5 r10 /* next word in s1 */
38 #define rWORD6 r11 /* next word in s2 */
39 #define rWORD7 r30 /* next word in s1 */
40 #define rWORD8 r31 /* next word in s2 */
48 beq- cr6, L(zeroLength)
51 /* If less than 8 bytes or not aligned, use the unaligned
53 blt cr1, L(bytealigned)
56 cfi_offset(rWORD8, -8)
57 cfi_offset(rWORD7, -16)
59 /* At this point we know both strings have the same alignment and the
60 compare length is at least 8 bytes. r12 contains the low order
61 3 bits of rSTR1 and cr5 contains the result of the logical compare
62 of r12 to 0. If r12 == 0 then we are already double word
63 aligned and can perform the DW aligned loop.
65 Otherwise we know the two strings have the same alignment (but not
66 yet DW). So we force the string addresses to the next lower DW
67 boundary and special case this first DW using shift left to
68 eliminate bits preceding the first byte. Since we want to join the
69 normal (DW aligned) compare loop, starting at the second double word,
70 we need to adjust the length (rN) and special case the loop
71 versioning for the first DW. This ensures that the loop count is
72 correct and the first DW (shifted) is in the expected register pair. */
75 clrrdi rSTR1, rSTR1, 3
76 clrrdi rSTR2, rSTR2, 3
80 srdi r0, rN, 5 /* Divide by 32 */
81 andi. r12, rN, 24 /* Get the DW remainder */
82 #ifdef __LITTLE_ENDIAN__
83 ldbrx rWORD1, 0, rSTR1
84 ldbrx rWORD2, 0, rSTR2
95 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
102 sld rWORD5, rWORD1, rWORD6
103 sld rWORD6, rWORD2, rWORD6
104 cmpld cr5, rWORD5, rWORD6
106 /* Do something useful in this cycle since we have to branch anyway. */
107 #ifdef __LITTLE_ENDIAN__
108 ldbrx rWORD1, 0, rSTR1
109 ldbrx rWORD2, 0, rSTR2
116 cmpld cr7, rWORD1, rWORD2
118 /* Remainder is 16 */
121 sld rWORD5, rWORD1, rWORD6
122 sld rWORD6, rWORD2, rWORD6
123 cmpld cr6, rWORD5, rWORD6
125 /* Do something useful in this cycle since we have to branch anyway. */
126 #ifdef __LITTLE_ENDIAN__
127 ldbrx rWORD7, 0, rSTR1
128 ldbrx rWORD8, 0, rSTR2
135 cmpld cr5, rWORD7, rWORD8
137 /* Remainder is 24 */
140 sld rWORD3, rWORD1, rWORD6
141 sld rWORD4, rWORD2, rWORD6
142 cmpld cr1, rWORD3, rWORD4
144 /* Count is a multiple of 32, remainder is 0 */
147 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
148 sld rWORD1, rWORD1, rWORD6
149 sld rWORD2, rWORD2, rWORD6
150 cmpld cr7, rWORD1, rWORD2
153 /* At this point we know both strings are double word aligned and the
154 compare length is at least 8 bytes. */
157 andi. r12, rN, 24 /* Get the DW remainder */
158 srdi r0, rN, 5 /* Divide by 32 */
169 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
170 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
171 (8-15 byte compare), we want to use only volatile registers. This
172 means we can avoid restoring non-volatile registers since we did not
173 change any on the early exit path. The key here is the non-early
174 exit path only cares about the condition code (cr5), not about which
175 register pair was used. */
176 #ifdef __LITTLE_ENDIAN__
177 ldbrx rWORD5, 0, rSTR1
178 ldbrx rWORD6, 0, rSTR2
185 cmpld cr5, rWORD5, rWORD6
187 #ifdef __LITTLE_ENDIAN__
188 ldbrx rWORD1, 0, rSTR1
189 ldbrx rWORD2, 0, rSTR2
196 cmpld cr7, rWORD1, rWORD2
198 #ifdef __LITTLE_ENDIAN__
199 ldbrx rWORD3, 0, rSTR1
200 ldbrx rWORD4, 0, rSTR2
207 cmpld cr1, rWORD3, rWORD4
208 #ifdef __LITTLE_ENDIAN__
209 ldbrx rWORD5, 0, rSTR1
210 ldbrx rWORD6, 0, rSTR2
217 cmpld cr6, rWORD5, rWORD6
221 #ifdef __LITTLE_ENDIAN__
222 ldbrx rWORD7, 0, rSTR1
223 ldbrx rWORD8, 0, rSTR2
227 ldu rWORD7, 32(rSTR1)
228 ldu rWORD8, 32(rSTR2)
231 cmpld cr5, rWORD7, rWORD8
240 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
245 /* Remainder is 16 */
248 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
249 #ifdef __LITTLE_ENDIAN__
250 ldbrx rWORD5, 0, rSTR1
251 ldbrx rWORD6, 0, rSTR2
258 cmpld cr6, rWORD5, rWORD6
260 #ifdef __LITTLE_ENDIAN__
261 ldbrx rWORD7, 0, rSTR1
262 ldbrx rWORD8, 0, rSTR2
269 cmpld cr5, rWORD7, rWORD8
271 #ifdef __LITTLE_ENDIAN__
272 ldbrx rWORD1, 0, rSTR1
273 ldbrx rWORD2, 0, rSTR2
280 cmpld cr7, rWORD1, rWORD2
281 #ifdef __LITTLE_ENDIAN__
282 ldbrx rWORD3, 0, rSTR1
283 ldbrx rWORD4, 0, rSTR2
290 cmpld cr1, rWORD3, rWORD4
291 #ifndef __LITTLE_ENDIAN__
298 /* Again we are on an early exit path (16-23 byte compare), we want to
299 only use volatile registers and avoid restoring non-volatile
303 #ifdef __LITTLE_ENDIAN__
304 ldbrx rWORD3, 0, rSTR1
305 ldbrx rWORD4, 0, rSTR2
312 cmpld cr1, rWORD3, rWORD4
315 #ifndef __LITTLE_ENDIAN__
320 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
325 /* Remainder is 24 */
328 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
329 #ifdef __LITTLE_ENDIAN__
330 ldbrx rWORD3, 0, rSTR1
331 ldbrx rWORD4, 0, rSTR2
338 cmpld cr1, rWORD3, rWORD4
340 #ifdef __LITTLE_ENDIAN__
341 ldbrx rWORD5, 0, rSTR1
342 ldbrx rWORD6, 0, rSTR2
349 cmpld cr6, rWORD5, rWORD6
351 #ifdef __LITTLE_ENDIAN__
352 ldbrx rWORD7, 0, rSTR1
353 ldbrx rWORD8, 0, rSTR2
360 cmpld cr5, rWORD7, rWORD8
361 #ifdef __LITTLE_ENDIAN__
362 ldbrx rWORD1, 0, rSTR1
363 ldbrx rWORD2, 0, rSTR2
370 cmpld cr7, rWORD1, rWORD2
371 #ifndef __LITTLE_ENDIAN__
372 addi rSTR1, rSTR1, 16
373 addi rSTR2, rSTR2, 16
378 /* Again we are on an early exit path (24-31 byte compare), we want to
379 only use volatile registers and avoid restoring non-volatile
383 #ifdef __LITTLE_ENDIAN__
384 ldbrx rWORD1, 0, rSTR1
385 ldbrx rWORD2, 0, rSTR2
392 cmpld cr7, rWORD1, rWORD2
395 #ifndef __LITTLE_ENDIAN__
396 addi rSTR1, rSTR1, 16
397 addi rSTR2, rSTR2, 16
400 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
406 /* Count is a multiple of 32, remainder is 0 */
409 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
410 #ifdef __LITTLE_ENDIAN__
411 ldbrx rWORD1, 0, rSTR1
412 ldbrx rWORD2, 0, rSTR2
419 cmpld cr7, rWORD1, rWORD2
421 #ifdef __LITTLE_ENDIAN__
422 ldbrx rWORD3, 0, rSTR1
423 ldbrx rWORD4, 0, rSTR2
430 cmpld cr1, rWORD3, rWORD4
431 #ifdef __LITTLE_ENDIAN__
432 ldbrx rWORD5, 0, rSTR1
433 ldbrx rWORD6, 0, rSTR2
440 cmpld cr6, rWORD5, rWORD6
441 #ifdef __LITTLE_ENDIAN__
442 ldbrx rWORD7, 0, rSTR1
443 ldbrx rWORD8, 0, rSTR2
447 ldu rWORD7, 24(rSTR1)
448 ldu rWORD8, 24(rSTR2)
450 cmpld cr5, rWORD7, rWORD8
453 bdz- L(d24) /* Adjust CTR as we start with +4 */
454 /* This is the primary loop */
457 #ifdef __LITTLE_ENDIAN__
458 ldbrx rWORD1, 0, rSTR1
459 ldbrx rWORD2, 0, rSTR2
466 cmpld cr1, rWORD3, rWORD4
469 #ifdef __LITTLE_ENDIAN__
470 ldbrx rWORD3, 0, rSTR1
471 ldbrx rWORD4, 0, rSTR2
478 cmpld cr6, rWORD5, rWORD6
481 #ifdef __LITTLE_ENDIAN__
482 ldbrx rWORD5, 0, rSTR1
483 ldbrx rWORD6, 0, rSTR2
490 cmpld cr5, rWORD7, rWORD8
493 #ifdef __LITTLE_ENDIAN__
494 ldbrx rWORD7, 0, rSTR1
495 ldbrx rWORD8, 0, rSTR2
499 ldu rWORD7, 32(rSTR1)
500 ldu rWORD8, 32(rSTR2)
503 cmpld cr7, rWORD1, rWORD2
507 cmpld cr1, rWORD3, rWORD4
509 cmpld cr6, rWORD5, rWORD6
511 cmpld cr5, rWORD7, rWORD8
524 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
526 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
527 we are aligned it is safe to load the whole double word, and use
528 shift right double to eliminate bits beyond the compare length. */
530 #ifdef __LITTLE_ENDIAN__
531 ldbrx rWORD1, 0, rSTR1
532 ldbrx rWORD2, 0, rSTR2
539 srd rWORD1, rWORD1, rN
540 srd rWORD2, rWORD2, rN
541 cmpld cr7, rWORD1, rWORD2
585 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
587 /* Huh? We've already branched on cr6! */
588 beq- cr6, L(zeroLength)
591 /* We need to prime this loop. This loop is swing modulo scheduled
592 to avoid pipe delays. The dependent instruction latencies (load to
593 compare to conditional branch) is 2 to 3 cycles. In this loop each
594 dispatch group ends in a branch and takes 1 cycle. Effectively
595 the first iteration of the loop only serves to load operands and
596 branches based on compares are delayed until the next loop.
598 So we must precondition some registers and condition codes so that
599 we don't exit the loop early on the first iteration. */
604 cmpld cr7, rWORD1, rWORD2
608 cmpld cr1, rWORD3, rWORD4
609 lbzu rWORD5, 2(rSTR1)
610 lbzu rWORD6, 2(rSTR2)
614 lbzu rWORD1, 1(rSTR1)
615 lbzu rWORD2, 1(rSTR2)
618 cmpld cr6, rWORD5, rWORD6
621 lbzu rWORD3, 1(rSTR1)
622 lbzu rWORD4, 1(rSTR2)
625 cmpld cr7, rWORD1, rWORD2
628 lbzu rWORD5, 1(rSTR1)
629 lbzu rWORD6, 1(rSTR2)
632 cmpld cr1, rWORD3, rWORD4
635 /* We are speculatively loading bytes before we have tested the previous
636 bytes. But we must avoid overrunning the length (in the ctr) to
637 prevent these speculative loads from causing a segfault. In this
638 case the loop will exit early (before all pending bytes are
639 tested). In this case we must complete the pending operations
676 sub rRTN, rWORD5, rWORD6
682 sub rRTN, rWORD3, rWORD4
686 sub rRTN, rWORD1, rWORD2
694 /* At this point we know the strings have different alignment and the
695 compare length is at least 8 bytes. r12 contains the low order
696 3 bits of rSTR1 and cr5 contains the result of the logical compare
697 of r12 to 0. If r12 == 0 then rSTR1 is double word
698 aligned and can perform the DWunaligned loop.
700 Otherwise we know that rSTR1 is not already DW aligned yet.
701 So we can force the string addresses to the next lower DW
702 boundary and special case this first DW using shift left to
703 eliminate bits preceding the first byte. Since we want to join the
704 normal (DWaligned) compare loop, starting at the second double word,
705 we need to adjust the length (rN) and special case the loop
706 versioning for the first DW. This ensures that the loop count is
707 correct and the first DW (shifted) is in the expected register pair. */
708 #define rSHL r29 /* Unaligned shift left count. */
709 #define rSHR r28 /* Unaligned shift right count. */
710 #define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
711 #define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
712 #define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
713 #define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
716 cfi_offset(rSHL, -24)
717 clrldi rSHL, rSTR2, 61
718 beq- cr6, L(duzeroLength)
720 cfi_offset(rSHR, -32)
721 beq cr5, L(DWunaligned)
722 std rWORD8_SHIFT, -40(r1)
723 cfi_offset(rWORD8_SHIFT, -40)
724 /* Adjust the logical start of rSTR2 to compensate for the extra bits
725 in the 1st rSTR1 DW. */
726 sub rWORD8_SHIFT, rSTR2, r12
727 /* But do not attempt to address the DW before that DW that contains
728 the actual start of rSTR2. */
729 clrrdi rSTR2, rSTR2, 3
730 std rWORD2_SHIFT, -48(r1)
731 /* Compute the left/right shift counts for the unaligned rSTR2,
732 compensating for the logical (DW aligned) start of rSTR1. */
733 clrldi rSHL, rWORD8_SHIFT, 61
734 clrrdi rSTR1, rSTR1, 3
735 std rWORD4_SHIFT, -56(r1)
737 cmpld cr5, rWORD8_SHIFT, rSTR2
740 std rWORD6_SHIFT, -64(r1)
741 cfi_offset(rWORD2_SHIFT, -48)
742 cfi_offset(rWORD4_SHIFT, -56)
743 cfi_offset(rWORD6_SHIFT, -64)
744 subfic rSHR, rSHL, 64
745 srdi r0, rN, 5 /* Divide by 32 */
746 andi. r12, rN, 24 /* Get the DW remainder */
747 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
748 this special case those bits may be discarded anyway. Also we
749 must avoid loading a DW where none of the bits are part of rSTR2 as
750 this may cross a page boundary and cause a page fault. */
753 #ifdef __LITTLE_ENDIAN__
754 ldbrx rWORD8, 0, rSTR2
760 sld rWORD8, rWORD8, rSHL
763 #ifdef __LITTLE_ENDIAN__
764 ldbrx rWORD1, 0, rSTR1
765 ldbrx rWORD2, 0, rSTR2
774 srd r12, rWORD2, rSHR
777 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
778 or rWORD8, r12, rWORD8
785 sld rWORD8_SHIFT, rWORD2, rSHL
786 sld rWORD7, rWORD1, rWORD6
787 sld rWORD8, rWORD8, rWORD6
789 /* At this point we exit early with the first double word compare
790 complete and remainder of 0 to 7 bytes. See L(du14) for details on
791 how we handle the remaining bytes. */
792 cmpld cr5, rWORD7, rWORD8
799 #ifdef __LITTLE_ENDIAN__
800 ldbrx rWORD2, 0, rSTR2
807 /* Remainder is 16 */
810 sld rWORD6_SHIFT, rWORD2, rSHL
811 sld rWORD5, rWORD1, rWORD6
812 sld rWORD6, rWORD8, rWORD6
814 /* Remainder is 24 */
817 sld rWORD4_SHIFT, rWORD2, rSHL
818 sld rWORD3, rWORD1, rWORD6
819 sld rWORD4, rWORD8, rWORD6
821 /* Count is a multiple of 32, remainder is 0 */
824 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
825 or rWORD8, r12, rWORD8
826 sld rWORD2_SHIFT, rWORD2, rSHL
827 sld rWORD1, rWORD1, rWORD6
828 sld rWORD2, rWORD8, rWORD6
831 /* At this point we know rSTR1 is double word aligned and the
832 compare length is at least 8 bytes. */
835 std rWORD8_SHIFT, -40(r1)
836 clrrdi rSTR2, rSTR2, 3
837 std rWORD2_SHIFT, -48(r1)
838 srdi r0, rN, 5 /* Divide by 32 */
839 std rWORD4_SHIFT, -56(r1)
840 andi. r12, rN, 24 /* Get the DW remainder */
841 std rWORD6_SHIFT, -64(r1)
842 cfi_offset(rWORD8_SHIFT, -40)
843 cfi_offset(rWORD2_SHIFT, -48)
844 cfi_offset(rWORD4_SHIFT, -56)
845 cfi_offset(rWORD6_SHIFT, -64)
847 #ifdef __LITTLE_ENDIAN__
848 ldbrx rWORD6, 0, rSTR2
850 ldbrx rWORD8, 0, rSTR2
859 subfic rSHR, rSHL, 64
860 sld rWORD6_SHIFT, rWORD6, rSHL
862 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
869 srd r12, rWORD8, rSHR
870 #ifdef __LITTLE_ENDIAN__
871 ldbrx rWORD7, 0, rSTR1
876 sld rWORD8_SHIFT, rWORD8, rSHL
877 or rWORD8, r12, rWORD6_SHIFT
880 #ifdef __LITTLE_ENDIAN__
881 ldbrx rWORD1, 0, rSTR1
882 ldbrx rWORD2, 0, rSTR2
889 cmpld cr5, rWORD7, rWORD8
891 sld rWORD2_SHIFT, rWORD2, rSHL
892 or rWORD2, r0, rWORD8_SHIFT
893 #ifdef __LITTLE_ENDIAN__
894 ldbrx rWORD3, 0, rSTR1
895 ldbrx rWORD4, 0, rSTR2
902 cmpld cr7, rWORD1, rWORD2
903 srd r12, rWORD4, rSHR
904 sld rWORD4_SHIFT, rWORD4, rSHL
906 or rWORD4, r12, rWORD2_SHIFT
907 #ifdef __LITTLE_ENDIAN__
908 ldbrx rWORD5, 0, rSTR1
909 ldbrx rWORD6, 0, rSTR2
916 cmpld cr1, rWORD3, rWORD4
918 sld rWORD6_SHIFT, rWORD6, rSHL
920 or rWORD6, r0, rWORD4_SHIFT
921 cmpld cr6, rWORD5, rWORD6
924 /* At this point we exit early with the first double word compare
925 complete and remainder of 0 to 7 bytes. See L(du14) for details on
926 how we handle the remaining bytes. */
928 cmpld cr5, rWORD7, rWORD8
935 #ifdef __LITTLE_ENDIAN__
936 ldbrx rWORD2, 0, rSTR2
943 /* Remainder is 16 */
947 #ifdef __LITTLE_ENDIAN__
948 ldbrx rWORD5, 0, rSTR1
953 or rWORD6, r0, rWORD6_SHIFT
954 sld rWORD6_SHIFT, rWORD8, rSHL
956 #ifdef __LITTLE_ENDIAN__
957 ldbrx rWORD7, 0, rSTR1
958 ldbrx rWORD8, 0, rSTR2
965 cmpld cr6, rWORD5, rWORD6
966 srd r12, rWORD8, rSHR
967 sld rWORD8_SHIFT, rWORD8, rSHL
968 or rWORD8, r12, rWORD6_SHIFT
970 #ifdef __LITTLE_ENDIAN__
971 ldbrx rWORD1, 0, rSTR1
972 ldbrx rWORD2, 0, rSTR2
979 cmpld cr5, rWORD7, rWORD8
982 sld rWORD2_SHIFT, rWORD2, rSHL
983 or rWORD2, r0, rWORD8_SHIFT
984 #ifdef __LITTLE_ENDIAN__
985 ldbrx rWORD3, 0, rSTR1
986 ldbrx rWORD4, 0, rSTR2
993 cmpld cr7, rWORD1, rWORD2
995 srd r12, rWORD4, rSHR
996 sld rWORD4_SHIFT, rWORD4, rSHL
997 or rWORD4, r12, rWORD2_SHIFT
998 #ifndef __LITTLE_ENDIAN__
1000 addi rSTR2, rSTR2, 8
1002 cmpld cr1, rWORD3, rWORD4
1006 cmpld cr5, rWORD7, rWORD8
1007 #ifndef __LITTLE_ENDIAN__
1008 addi rSTR1, rSTR1, 8
1009 addi rSTR2, rSTR2, 8
1018 #ifdef __LITTLE_ENDIAN__
1019 ldbrx rWORD2, 0, rSTR2
1020 addi rSTR2, rSTR2, 8
1024 srd r0, rWORD2, rSHR
1027 /* Remainder is 24 */
1030 srd r12, rWORD8, rSHR
1031 #ifdef __LITTLE_ENDIAN__
1032 ldbrx rWORD3, 0, rSTR1
1033 addi rSTR1, rSTR1, 8
1037 sld rWORD4_SHIFT, rWORD8, rSHL
1038 or rWORD4, r12, rWORD6_SHIFT
1040 #ifdef __LITTLE_ENDIAN__
1041 ldbrx rWORD5, 0, rSTR1
1042 ldbrx rWORD6, 0, rSTR2
1043 addi rSTR1, rSTR1, 8
1044 addi rSTR2, rSTR2, 8
1049 cmpld cr1, rWORD3, rWORD4
1050 srd r0, rWORD6, rSHR
1051 sld rWORD6_SHIFT, rWORD6, rSHL
1052 or rWORD6, r0, rWORD4_SHIFT
1053 #ifdef __LITTLE_ENDIAN__
1054 ldbrx rWORD7, 0, rSTR1
1055 ldbrx rWORD8, 0, rSTR2
1056 addi rSTR1, rSTR1, 8
1057 addi rSTR2, rSTR2, 8
1059 ld rWORD7, 16(rSTR1)
1060 ld rWORD8, 16(rSTR2)
1062 cmpld cr6, rWORD5, rWORD6
1064 srd r12, rWORD8, rSHR
1065 sld rWORD8_SHIFT, rWORD8, rSHL
1066 or rWORD8, r12, rWORD6_SHIFT
1068 #ifdef __LITTLE_ENDIAN__
1069 ldbrx rWORD1, 0, rSTR1
1070 ldbrx rWORD2, 0, rSTR2
1071 addi rSTR1, rSTR1, 8
1072 addi rSTR2, rSTR2, 8
1074 ld rWORD1, 24(rSTR1)
1075 ld rWORD2, 24(rSTR2)
1077 cmpld cr5, rWORD7, rWORD8
1079 srd r0, rWORD2, rSHR
1080 sld rWORD2_SHIFT, rWORD2, rSHL
1081 or rWORD2, r0, rWORD8_SHIFT
1082 #ifndef __LITTLE_ENDIAN__
1083 addi rSTR1, rSTR1, 16
1084 addi rSTR2, rSTR2, 16
1086 cmpld cr7, rWORD1, rWORD2
1090 #ifndef __LITTLE_ENDIAN__
1091 addi rSTR1, rSTR1, 16
1092 addi rSTR2, rSTR2, 16
1095 /* Huh? We've already branched on cr1! */
1098 cmpld cr5, rWORD7, rWORD8
1106 #ifdef __LITTLE_ENDIAN__
1107 ldbrx rWORD2, 0, rSTR2
1108 addi rSTR2, rSTR2, 8
1112 srd r0, rWORD2, rSHR
1115 /* Count is a multiple of 32, remainder is 0 */
1118 mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
1119 srd r0, rWORD8, rSHR
1120 #ifdef __LITTLE_ENDIAN__
1121 ldbrx rWORD1, 0, rSTR1
1122 addi rSTR1, rSTR1, 8
1126 sld rWORD2_SHIFT, rWORD8, rSHL
1127 or rWORD2, r0, rWORD6_SHIFT
1129 #ifdef __LITTLE_ENDIAN__
1130 ldbrx rWORD3, 0, rSTR1
1131 ldbrx rWORD4, 0, rSTR2
1132 addi rSTR1, rSTR1, 8
1133 addi rSTR2, rSTR2, 8
1138 cmpld cr7, rWORD1, rWORD2
1139 srd r12, rWORD4, rSHR
1140 sld rWORD4_SHIFT, rWORD4, rSHL
1141 or rWORD4, r12, rWORD2_SHIFT
1142 #ifdef __LITTLE_ENDIAN__
1143 ldbrx rWORD5, 0, rSTR1
1144 ldbrx rWORD6, 0, rSTR2
1145 addi rSTR1, rSTR1, 8
1146 addi rSTR2, rSTR2, 8
1148 ld rWORD5, 16(rSTR1)
1149 ld rWORD6, 16(rSTR2)
1151 cmpld cr1, rWORD3, rWORD4
1153 srd r0, rWORD6, rSHR
1154 sld rWORD6_SHIFT, rWORD6, rSHL
1155 or rWORD6, r0, rWORD4_SHIFT
1156 #ifdef __LITTLE_ENDIAN__
1157 ldbrx rWORD7, 0, rSTR1
1158 ldbrx rWORD8, 0, rSTR2
1159 addi rSTR1, rSTR1, 8
1160 addi rSTR2, rSTR2, 8
1162 ldu rWORD7, 24(rSTR1)
1163 ldu rWORD8, 24(rSTR2)
1165 cmpld cr6, rWORD5, rWORD6
1167 srd r12, rWORD8, rSHR
1168 sld rWORD8_SHIFT, rWORD8, rSHL
1169 or rWORD8, r12, rWORD6_SHIFT
1170 cmpld cr5, rWORD7, rWORD8
1171 bdz- L(du24) /* Adjust CTR as we start with +4 */
1172 /* This is the primary loop */
1175 #ifdef __LITTLE_ENDIAN__
1176 ldbrx rWORD1, 0, rSTR1
1177 ldbrx rWORD2, 0, rSTR2
1178 addi rSTR1, rSTR1, 8
1179 addi rSTR2, rSTR2, 8
1184 cmpld cr1, rWORD3, rWORD4
1186 srd r0, rWORD2, rSHR
1187 sld rWORD2_SHIFT, rWORD2, rSHL
1188 or rWORD2, r0, rWORD8_SHIFT
1190 #ifdef __LITTLE_ENDIAN__
1191 ldbrx rWORD3, 0, rSTR1
1192 ldbrx rWORD4, 0, rSTR2
1193 addi rSTR1, rSTR1, 8
1194 addi rSTR2, rSTR2, 8
1196 ld rWORD3, 16(rSTR1)
1197 ld rWORD4, 16(rSTR2)
1199 cmpld cr6, rWORD5, rWORD6
1201 srd r12, rWORD4, rSHR
1202 sld rWORD4_SHIFT, rWORD4, rSHL
1203 or rWORD4, r12, rWORD2_SHIFT
1205 #ifdef __LITTLE_ENDIAN__
1206 ldbrx rWORD5, 0, rSTR1
1207 ldbrx rWORD6, 0, rSTR2
1208 addi rSTR1, rSTR1, 8
1209 addi rSTR2, rSTR2, 8
1211 ld rWORD5, 24(rSTR1)
1212 ld rWORD6, 24(rSTR2)
1214 cmpld cr5, rWORD7, rWORD8
1216 srd r0, rWORD6, rSHR
1217 sld rWORD6_SHIFT, rWORD6, rSHL
1218 or rWORD6, r0, rWORD4_SHIFT
1220 #ifdef __LITTLE_ENDIAN__
1221 ldbrx rWORD7, 0, rSTR1
1222 ldbrx rWORD8, 0, rSTR2
1223 addi rSTR1, rSTR1, 8
1224 addi rSTR2, rSTR2, 8
1226 ldu rWORD7, 32(rSTR1)
1227 ldu rWORD8, 32(rSTR2)
1229 cmpld cr7, rWORD1, rWORD2
1231 srd r12, rWORD8, rSHR
1232 sld rWORD8_SHIFT, rWORD8, rSHL
1233 or rWORD8, r12, rWORD6_SHIFT
1238 /* Huh? We've already branched on cr1! */
1241 cmpld cr1, rWORD3, rWORD4
1243 cmpld cr6, rWORD5, rWORD6
1245 cmpld cr5, rWORD7, rWORD8
1255 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
1256 shift right double to eliminate bits beyond the compare length.
1258 However it may not be safe to load rWORD2 which may be beyond the
1259 string length. So we compare the bit length of the remainder to
1260 the right shift count (rSHR). If the bit count is less than or equal
1261 we do not need to load rWORD2 (all significant bits are already in
1267 #ifdef __LITTLE_ENDIAN__
1268 ldbrx rWORD2, 0, rSTR2
1269 addi rSTR2, rSTR2, 8
1273 srd r0, rWORD2, rSHR
1276 #ifdef __LITTLE_ENDIAN__
1277 ldbrx rWORD1, 0, rSTR1
1282 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
1283 or rWORD2, r0, rWORD8_SHIFT
1286 srd rWORD1, rWORD1, rN
1287 srd rWORD2, rWORD2, rN
1289 ld rWORD8_SHIFT, -40(r1)
1291 cmpld cr7, rWORD1, rWORD2
1292 ld rWORD2_SHIFT, -48(r1)
1293 ld rWORD4_SHIFT, -56(r1)
1294 beq cr7, L(dureturn24)
1296 ld rWORD6_SHIFT, -64(r1)
1305 bgt cr7, L(dureturn29)
1315 bgt cr1, L(dureturn29)
1325 bgt cr6, L(dureturn29)
1335 bgt cr5, L(dureturn29)
1351 ld rWORD8_SHIFT, -40(r1)
1353 ld rWORD2_SHIFT, -48(r1)
1355 ld rWORD4_SHIFT, -56(r1)
1357 ld rWORD6_SHIFT, -64(r1)
1364 libc_hidden_builtin_def (memcmp)
1365 weak_alias (memcmp, bcmp)