/* Optimized memcmp implementation for POWER7/PowerPC64.
   Copyright (C) 2010-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* int [r3] memcmp (const char *s1 [r3],
                    const char *s2 [r4],
                    size_t size [r5])  */
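
/* For reference, the semantics being tuned here are those of a plain
   byte-wise memcmp.  A minimal C sketch (hypothetical helper name, not
   part of this file):

     #include <stddef.h>
     int simple_memcmp (const unsigned char *s1,
                        const unsigned char *s2, size_t n)
     {
       for (size_t i = 0; i < n; i++)
         if (s1[i] != s2[i])
           return s1[i] - s2[i];   // sign of first differing byte
       return 0;
     }
*/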
# define MEMCMP memcmp

ENTRY_TOCLESS (MEMCMP, 4)
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
#define rN r5 /* max string length */
#define rWORD1 r6 /* current word in s1 */
#define rWORD2 r7 /* current word in s2 */
#define rWORD3 r8 /* next word in s1 */
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */

#define rOFF8 r20 /* 8 bytes offset.  */
#define rOFF16 r21 /* 16 bytes offset.  */
#define rOFF24 r22 /* 24 bytes offset.  */
#define rOFF32 r23 /* 32 bytes offset.  */
#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8.  */
#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6.  */
#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4.  */
#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2.  */
#define rSHR r28 /* Unaligned shift right count.  */
#define rSHL r29 /* Unaligned shift left count.  */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */

#define rWORD8SAVE (-8)
#define rWORD7SAVE (-16)
#define rOFF8SAVE (-24)
#define rOFF16SAVE (-32)
#define rOFF24SAVE (-40)
#define rOFF32SAVE (-48)
#define rSHRSAVE (-56)
#define rSHLSAVE (-64)
#define rWORD8SHIFTSAVE (-72)
#define rWORD2SHIFTSAVE (-80)
#define rWORD4SHIFTSAVE (-88)
#define rWORD6SHIFTSAVE (-96)

#ifdef __LITTLE_ENDIAN__

	beq- cr6, L(zeroLength)
	/* If less than 8 bytes or not aligned, use the unaligned
	   byte loop.  */
	blt cr1, L(bytealigned)
	std rWORD8, rWORD8SAVE(r1)
	std rWORD7, rWORD7SAVE(r1)
	std rOFF8, rOFF8SAVE(r1)
	std rOFF16, rOFF16SAVE(r1)
	std rOFF24, rOFF24SAVE(r1)
	std rOFF32, rOFF32SAVE(r1)
	cfi_offset(rWORD8, rWORD8SAVE)
	cfi_offset(rWORD7, rWORD7SAVE)
	cfi_offset(rOFF8, rOFF8SAVE)
	cfi_offset(rOFF16, rOFF16SAVE)
	cfi_offset(rOFF24, rOFF24SAVE)
	cfi_offset(rOFF32, rOFF32SAVE)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then we are already double word
   aligned and can perform the DW aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DW aligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
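
/* A hedged C sketch of the idea above (hypothetical names; the real
   code keeps the shift count, 8 * r12, in a register for sld):

     #include <stdint.h>
     // Big-endian view: bytes that precede the true start of the
     // string sit in the high-order bits of the rounded-down DW, so a
     // left shift of 8 * offset discards them.
     static inline uint64_t first_dw (const char *s)
     {
       uintptr_t off = (uintptr_t) s & 7;                  // like r12
       uint64_t dw = *(const uint64_t *) ((uintptr_t) s - off);
       return dw << (8 * off);
     }

   Both strings share the same offset on this path, so the two shifted
   first DWs can be compared directly with cmpld.  */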
	clrrdi rSTR1, rSTR1, 3
	clrrdi rSTR2, rSTR2, 3
	beq cr5, L(DWaligned)

	srdi r0, rN, 5 /* Divide by 32 */
	andi. r12, rN, 24 /* Get the DW remainder */

	sld rWORD5, rWORD1, rWORD6
	sld rWORD6, rWORD2, rWORD6
	cmpld cr5, rWORD5, rWORD6

	/* Do something useful in this cycle since we have to branch anyway.  */
	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	cmpld cr7, rWORD1, rWORD2

/* Remainder is 16 */
	sld rWORD5, rWORD1, rWORD6
	sld rWORD6, rWORD2, rWORD6
	cmpld cr6, rWORD5, rWORD6

	/* Do something useful in this cycle since we have to branch anyway.  */
	LD rWORD7, rOFF8, rSTR1
	LD rWORD8, rOFF8, rSTR2
	cmpld cr5, rWORD7, rWORD8

/* Remainder is 24 */
	sld rWORD3, rWORD1, rWORD6
	sld rWORD4, rWORD2, rWORD6
	cmpld cr1, rWORD3, rWORD4

/* Count is a multiple of 32, remainder is 0 */
	sld rWORD1, rWORD1, rWORD6
	sld rWORD2, rWORD2, rWORD6
	cmpld cr7, rWORD1, rWORD2

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	andi. r12, rN, 24 /* Get the DW remainder */
	srdi r0, rN, 5 /* Divide by 32 */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is that the
   non-early-exit path cares only about the condition code (cr5), not
   about which register pair was used.  */
	cmpld cr5, rWORD5, rWORD6

	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	cmpld cr7, rWORD1, rWORD2

	LD rWORD3, rOFF16, rSTR1
	LD rWORD4, rOFF16, rSTR2
	cmpld cr1, rWORD3, rWORD4
	LD rWORD5, rOFF24, rSTR1
	LD rWORD6, rOFF24, rSTR2
	cmpld cr6, rWORD5, rWORD6

	LD rWORD7, rOFF32, rSTR1
	LD rWORD8, rOFF32, rSTR2
	addi rSTR1, rSTR1, 32
	addi rSTR2, rSTR2, 32

	cmpld cr5, rWORD7, rWORD8

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).  */

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

/* Remainder is 16 */

	cmpld cr6, rWORD5, rWORD6

	LD rWORD7, rOFF8, rSTR1
	LD rWORD8, rOFF8, rSTR2
	cmpld cr5, rWORD7, rWORD8

	LD rWORD1, rOFF16, rSTR1
	LD rWORD2, rOFF16, rSTR2
	cmpld cr7, rWORD1, rWORD2
	LD rWORD3, rOFF24, rSTR1
	LD rWORD4, rOFF24, rSTR2
	cmpld cr1, rWORD3, rWORD4

	LD rWORD3, rOFF8, rSTR1
	LD rWORD4, rOFF8, rSTR2
	cmpld cr1, rWORD3, rWORD4

	subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).  */

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

/* Remainder is 24 */

	cmpld cr1, rWORD3, rWORD4

	LD rWORD5, rOFF8, rSTR1
	LD rWORD6, rOFF8, rSTR2
	cmpld cr6, rWORD5, rWORD6

	LD rWORD7, rOFF16, rSTR1
	LD rWORD8, rOFF16, rSTR2
	cmpld cr5, rWORD7, rWORD8
	LD rWORD1, rOFF24, rSTR1
	LD rWORD2, rOFF24, rSTR2
	cmpld cr7, rWORD1, rWORD2
	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16
/* Again we are on an early exit path (24-31 byte compare); we want to
   use only volatile registers and avoid restoring non-volatile
   registers.  */
	LD rWORD1, rOFF16, rSTR1
	LD rWORD2, rOFF16, rSTR2
	cmpld cr7, rWORD1, rWORD2

	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16

	subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).  */

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

/* Count is a multiple of 32, remainder is 0 */

	cmpld cr7, rWORD1, rWORD2

	LD rWORD3, rOFF8, rSTR1
	LD rWORD4, rOFF8, rSTR2
	cmpld cr1, rWORD3, rWORD4
	LD rWORD5, rOFF16, rSTR1
	LD rWORD6, rOFF16, rSTR2
	cmpld cr6, rWORD5, rWORD6
	LD rWORD7, rOFF24, rSTR1
	LD rWORD8, rOFF24, rSTR2
	addi rSTR1, rSTR1, 24
	addi rSTR2, rSTR2, 24
	cmpld cr5, rWORD7, rWORD8

	bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */

	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	cmpld cr1, rWORD3, rWORD4

	LD rWORD3, rOFF16, rSTR1
	LD rWORD4, rOFF16, rSTR2
	cmpld cr6, rWORD5, rWORD6

	LD rWORD5, rOFF24, rSTR1
	LD rWORD6, rOFF24, rSTR2
	cmpld cr5, rWORD7, rWORD8

	LD rWORD7, rOFF32, rSTR1
	LD rWORD8, rOFF32, rSTR2
	addi rSTR1, rSTR1, 32
	addi rSTR2, rSTR2, 32

	cmpld cr7, rWORD1, rWORD2

	cmpld cr1, rWORD3, rWORD4

	cmpld cr6, rWORD5, rWORD6

	cmpld cr5, rWORD7, rWORD8

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)
	subfic rN, r12, 64 /* Shift count is 64 - (rN * 8).  */
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
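
/* A hedged C sketch of the trick (hypothetical name): with rem = 1..7
   bytes left and the whole DW safely loadable, shift right by
   64 - 8 * rem (the count the nearby subfic computes) so only the
   first rem bytes, the high-order bytes on big-endian, take part in
   the compare:

     #include <stdint.h>
     static inline int tail_cmp (uint64_t w1, uint64_t w2, unsigned rem)
     {
       unsigned sh = 64 - 8 * rem;
       w1 >>= sh;
       w2 >>= sh;
       return (w1 > w2) - (w1 < w2);   // sign only, like the cmpld
     }
*/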
	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	srd rWORD1, rWORD1, rN
	srd rWORD2, rWORD2, rN
	cmpld cr7, rWORD1, rWORD2

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
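
/* A hedged C model of the modulo-scheduled byte loop (hypothetical
   name; one compare is always in flight, mirroring the load ->
   compare -> delayed-branch pattern described above):

     #include <stddef.h>
     static int pipelined_cmp (const unsigned char *p1,
                               const unsigned char *p2, size_t n)
     {
       int pending = *p1++ - *p2++;    // prologue: branch deferred
       while (--n)
         {
           int next = *p1++ - *p2++;   // next loads issue first
           if (pending)                // branch on the previous compare
             return pending;
           pending = next;
         }
       return pending;                 // epilogue: resolve the last one
     }
*/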
	cmpld cr7, rWORD1, rWORD2

	cmpld cr1, rWORD3, rWORD4
	lbzu rWORD5, 2(rSTR1)
	lbzu rWORD6, 2(rSTR2)

	lbzu rWORD1, 1(rSTR1)
	lbzu rWORD2, 1(rSTR2)

	cmpld cr6, rWORD5, rWORD6

	lbzu rWORD3, 1(rSTR1)
	lbzu rWORD4, 1(rSTR2)

	cmpld cr7, rWORD1, rWORD2

	lbzu rWORD5, 1(rSTR1)
	lbzu rWORD6, 1(rSTR2)

	cmpld cr1, rWORD3, rWORD4
/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all pending bytes are
   tested).  In this case we must complete the pending operations
   before returning.  */
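
/* A hedged sketch of the drain on early exit (hypothetical name): the
   counter bounds the loads, so up to three byte compares can still be
   unbranched when the loop stops; they are resolved in program order,
   oldest first, like the subtractions below:

     static int drain_pending (int pend1, int pend2, int pend3)
     {
       if (pend1) return pend1;   // oldest, cr7 (rWORD1/rWORD2)
       if (pend2) return pend2;   // cr1 (rWORD3/rWORD4)
       return pend3;              // newest, cr6 (rWORD5/rWORD6)
     }
*/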
	sub rRTN, rWORD5, rWORD6

	sub rRTN, rWORD3, rWORD4

	sub rRTN, rWORD1, rWORD2
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  r12 contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of r12 to 0.  If r12 == 0 then rSTR1 is double word
   aligned and we can perform the DWunaligned loop.

   Otherwise we know that rSTR1 is not already DW aligned yet.
   So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
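
/* A hedged C sketch of the recombination used throughout the unaligned
   loop (hypothetical name): each logical DW of rSTR2 is assembled from
   two adjacent aligned loads with complementary shifts, as in the
   sld/srd/or triples below.  Here shl is a multiple of 8 in 8..56 and
   shr = 64 - shl:

     #include <stdint.h>
     static inline uint64_t merge_dw (uint64_t prev, uint64_t cur,
                                      unsigned shl)
     {
       // big-endian: tail bytes of prev become the head of the
       // logical DW, head bytes of cur become its tail
       return (prev << shl) | (cur >> (64 - shl));
     }
*/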
	std rSHL, rSHLSAVE(r1)
	cfi_offset(rSHL, rSHLSAVE)
	clrldi rSHL, rSTR2, 61
	beq cr6, L(duzeroLength)
	std rSHR, rSHRSAVE(r1)
	cfi_offset(rSHR, rSHRSAVE)
	beq cr5, L(DWunaligned)
	std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	/* Adjust the logical start of rSTR2 to compensate for the extra bits
	   in the 1st rSTR1 DW.  */
	sub rWORD8_SHIFT, rSTR2, r12
	/* But do not attempt to address the DW before the one that contains
	   the actual start of rSTR2.  */
	clrrdi rSTR2, rSTR2, 3
	std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	/* Compute the left/right shift counts for the unaligned rSTR2,
	   compensating for the logical (DW aligned) start of rSTR1.  */
	clrldi rSHL, rWORD8_SHIFT, 61
	clrrdi rSTR1, rSTR1, 3
	std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)

	cmpld cr5, rWORD8_SHIFT, rSTR2

	std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	subfic rSHR, rSHL, 64
	srdi r0, rN, 5 /* Divide by 32 */
	andi. r12, rN, 24 /* Get the DW remainder */
	/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
	   this special case those bits may be discarded anyway.  Also we
	   must avoid loading a DW where none of the bits are part of rSTR2 as
	   this may cross a page boundary and cause a page fault.  */
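
/* A hedged sketch of that guard (hypothetical name): an aligned DW
   load never crosses a page, so a fault is possible only when a DW
   holding no rSTR2 bytes is touched.  The cmpld of the adjusted start
   against the rounded-down address above implements a check of this
   kind:

     #include <stdint.h>
     static inline int holds_str2_bytes (uintptr_t logical_start,
                                         uintptr_t aligned_dw)
     {
       // true when at least one byte of the aligned DW belongs to
       // the logical string
       return logical_start <= aligned_dw + 7;
     }
*/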
	sld rWORD8, rWORD8, rSHL

	srd r12, rWORD2, rSHR

	or rWORD8, r12, rWORD8

	sld rWORD8_SHIFT, rWORD2, rSHL
	sld rWORD7, rWORD1, rWORD6
	sld rWORD8, rWORD8, rWORD6

	/* At this point we exit early with the first double word compare
	   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
	   how we handle the remaining bytes.  */
	cmpld cr5, rWORD7, rWORD8

	LD rWORD2, rOFF8, rSTR2

/* Remainder is 16 */
	sld rWORD6_SHIFT, rWORD2, rSHL
	sld rWORD5, rWORD1, rWORD6
	sld rWORD6, rWORD8, rWORD6

/* Remainder is 24 */
	sld rWORD4_SHIFT, rWORD2, rSHL
	sld rWORD3, rWORD1, rWORD6
	sld rWORD4, rWORD8, rWORD6

/* Count is a multiple of 32, remainder is 0 */
	or rWORD8, r12, rWORD8
	sld rWORD2_SHIFT, rWORD2, rSHL
	sld rWORD1, rWORD1, rWORD6
	sld rWORD2, rWORD8, rWORD6
/* At this point we know rSTR1 is double word aligned and the
   compare length is at least 8 bytes.  */

	std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	clrrdi rSTR2, rSTR2, 3
	std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	srdi r0, rN, 5 /* Divide by 32 */
	std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	andi. r12, rN, 24 /* Get the DW remainder */
	std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
	LD rWORD8, rOFF8, rSTR2

	subfic rSHR, rSHL, 64
	sld rWORD6_SHIFT, rWORD6, rSHL

	srd r12, rWORD8, rSHR

	sld rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT

	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	cmpld cr5, rWORD7, rWORD8

	sld rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
	LD rWORD3, rOFF16, rSTR1
	LD rWORD4, rOFF16, rSTR2
	cmpld cr7, rWORD1, rWORD2
	srd r12, rWORD4, rSHR
	sld rWORD4_SHIFT, rWORD4, rSHL

	or rWORD4, r12, rWORD2_SHIFT
	LD rWORD5, rOFF24, rSTR1
	LD rWORD6, rOFF24, rSTR2
	cmpld cr1, rWORD3, rWORD4

	sld rWORD6_SHIFT, rWORD6, rSHL

	or rWORD6, r0, rWORD4_SHIFT
	cmpld cr6, rWORD5, rWORD6

	/* At this point we exit early with the first double word compare
	   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
	   how we handle the remaining bytes.  */
	cmpld cr5, rWORD7, rWORD8
	LD rWORD2, rOFF8, rSTR2

/* Remainder is 16 */

	or rWORD6, r0, rWORD6_SHIFT
	sld rWORD6_SHIFT, rWORD8, rSHL

	LD rWORD7, rOFF8, rSTR1
	LD rWORD8, rOFF8, rSTR2
	cmpld cr6, rWORD5, rWORD6
	srd r12, rWORD8, rSHR
	sld rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT

	LD rWORD1, rOFF16, rSTR1
	LD rWORD2, rOFF16, rSTR2
	cmpld cr5, rWORD7, rWORD8

	sld rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
	LD rWORD3, rOFF24, rSTR1
	LD rWORD4, rOFF24, rSTR2
	cmpld cr7, rWORD1, rWORD2

	srd r12, rWORD4, rSHR
	sld rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT

	cmpld cr1, rWORD3, rWORD4

	cmpld cr5, rWORD7, rWORD8

	LD rWORD2, rOFF8, rSTR2

/* Remainder is 24 */

	srd r12, rWORD8, rSHR

	sld rWORD4_SHIFT, rWORD8, rSHL
	or rWORD4, r12, rWORD6_SHIFT

	LD rWORD5, rOFF8, rSTR1
	LD rWORD6, rOFF8, rSTR2
	cmpld cr1, rWORD3, rWORD4

	sld rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT
	LD rWORD7, rOFF16, rSTR1
	LD rWORD8, rOFF16, rSTR2
	cmpld cr6, rWORD5, rWORD6

	srd r12, rWORD8, rSHR
	sld rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT

	LD rWORD1, rOFF24, rSTR1
	LD rWORD2, rOFF24, rSTR2
	cmpld cr5, rWORD7, rWORD8

	sld rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT
	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16
	cmpld cr7, rWORD1, rWORD2

	addi rSTR1, rSTR1, 16
	addi rSTR2, rSTR2, 16
	cmpld cr5, rWORD7, rWORD8

	LD rWORD2, rOFF8, rSTR2
/* Count is a multiple of 32, remainder is 0 */

	sld rWORD2_SHIFT, rWORD8, rSHL
	or rWORD2, r0, rWORD6_SHIFT

	LD rWORD3, rOFF8, rSTR1
	LD rWORD4, rOFF8, rSTR2
	cmpld cr7, rWORD1, rWORD2
	srd r12, rWORD4, rSHR
	sld rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT
	LD rWORD5, rOFF16, rSTR1
	LD rWORD6, rOFF16, rSTR2
	cmpld cr1, rWORD3, rWORD4

	sld rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT
	LD rWORD7, rOFF24, rSTR1
	LD rWORD8, rOFF24, rSTR2
	addi rSTR1, rSTR1, 24
	addi rSTR2, rSTR2, 24
	cmpld cr6, rWORD5, rWORD6

	srd r12, rWORD8, rSHR
	sld rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT
	cmpld cr5, rWORD7, rWORD8
	bdz L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */

	LD rWORD1, rOFF8, rSTR1
	LD rWORD2, rOFF8, rSTR2
	cmpld cr1, rWORD3, rWORD4

	sld rWORD2_SHIFT, rWORD2, rSHL
	or rWORD2, r0, rWORD8_SHIFT

	LD rWORD3, rOFF16, rSTR1
	LD rWORD4, rOFF16, rSTR2
	cmpld cr6, rWORD5, rWORD6

	srd r12, rWORD4, rSHR
	sld rWORD4_SHIFT, rWORD4, rSHL
	or rWORD4, r12, rWORD2_SHIFT

	LD rWORD5, rOFF24, rSTR1
	LD rWORD6, rOFF24, rSTR2
	cmpld cr5, rWORD7, rWORD8

	sld rWORD6_SHIFT, rWORD6, rSHL
	or rWORD6, r0, rWORD4_SHIFT

	LD rWORD7, rOFF32, rSTR1
	LD rWORD8, rOFF32, rSTR2
	addi rSTR1, rSTR1, 32
	addi rSTR2, rSTR2, 32
	cmpld cr7, rWORD1, rWORD2

	srd r12, rWORD8, rSHR
	sld rWORD8_SHIFT, rWORD8, rSHL
	or rWORD8, r12, rWORD6_SHIFT

	cmpld cr1, rWORD3, rWORD4

	cmpld cr6, rWORD5, rWORD6

	cmpld cr5, rWORD7, rWORD8
/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
   shift right double to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rWORD8_SHIFT).  */
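
/* A hedged sketch of that test (hypothetical name): rWORD8_SHIFT
   already carries rSHR significant bits of rSTR2, so the extra,
   possibly faulting, load is needed only when the remainder wants
   more bits than that:

     static inline int need_load_rword2 (unsigned rem_bytes,
                                         unsigned shr)
     {
       return 8 * rem_bytes > shr;   // remainder bit length vs rSHR
     }
*/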
	LD rWORD2, rOFF8, rSTR2

	LD rWORD1, rOFF8, rSTR1

	subfic rN, rN, 64 /* Shift count is 64 - (rN * 8).  */
	or rWORD2, r0, rWORD8_SHIFT
	ld rWORD7, rWORD7SAVE(r1)
	ld rSHL, rSHLSAVE(r1)
	srd rWORD1, rWORD1, rN
	srd rWORD2, rWORD2, rN
	ld rSHR, rSHRSAVE(r1)
	ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)

	cmpld cr7, rWORD1, rWORD2
	ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
	beq cr7, L(dureturn24)

	ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	bgt cr7, L(dureturn29)
	ld rSHL, rSHLSAVE(r1)
	ld rSHR, rSHRSAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	bgt cr1, L(dureturn29)
	ld rSHL, rSHLSAVE(r1)
	ld rSHR, rSHRSAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	bgt cr6, L(dureturn29)
	ld rSHL, rSHLSAVE(r1)
	ld rSHR, rSHRSAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	bgt cr5, L(dureturn29)
	ld rSHL, rSHLSAVE(r1)
	ld rSHR, rSHRSAVE(r1)

	ld rWORD8, rWORD8SAVE(r1)
	ld rWORD7, rWORD7SAVE(r1)

	ld rSHL, rSHLSAVE(r1)
	ld rSHR, rSHRSAVE(r1)

	ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
	ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
	ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)

	ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

	ld rOFF8, rOFF8SAVE(r1)
	ld rOFF16, rOFF16SAVE(r1)
	ld rOFF24, rOFF24SAVE(r1)
	ld rOFF32, rOFF32SAVE(r1)

libc_hidden_builtin_def (memcmp)
weak_alias (memcmp, bcmp)
strong_alias (memcmp, __memcmpeq)
libc_hidden_def (__memcmpeq)