/* Optimized memcmp implementation for POWER7/PowerPC32.
   Copyright (C) 2010 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */
/* int [r3] memcmp (const char *s1 [r3],
                    const char *s2 [r4],
                    unsigned long size [r5])  */
EALIGN (BP_SYM(memcmp),4,0)

/* Register aliases.  rTMP and rRTN are used below but defined
   elsewhere (sysdep headers) -- NOTE(review): rRTN is presumably the
   r3 return register on powerpc32; confirm against sysdep.h.  */
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
#define rN r5 /* max string length */
#define rWORD1 r6 /* current word in s1 */
#define rWORD2 r7 /* current word in s2 */
#define rWORD3 r8 /* next word in s1 */
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
#define rBITDIF r12 /* bits that differ in s1 & s2 words */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */

/* Isolate the low-order 2 bits of rSTR1 (its word misalignment).  */
	clrlwi rBITDIF,rSTR1,30
	beq- cr6,L(zeroLength)
/* If less than 8 bytes or not aligned, use the unaligned byte
   compare loop.  */
	blt cr1,L(bytealigned)
/* Describe the non-volatile register saves (r30/r31) to the
   unwinder; the frame is 64 bytes.  */
	cfi_adjust_cfa_offset(64)
	cfi_offset(31,(48-64))
	cfi_offset(30,(44-64))
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
	srwi rTMP,rN,4 /* Divide by 16 */
	andi. rBITDIF,rN,12 /* Get the word remainder */
	slw rWORD5,rWORD1,r11
	slw rWORD6,rWORD2,r11
	cmplw cr5,rWORD5,rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
	cmplw cr0,rWORD1,rWORD2
	slw rWORD5,rWORD1,r11
	slw rWORD6,rWORD2,r11
	cmplw cr6,rWORD5,rWORD6
/* Do something useful in this cycle since we have to branch anyway.  */
	cmplw cr5,rWORD7,rWORD8
/* Remainder is 12 */
	slw rWORD3,rWORD1,r11
	slw rWORD4,rWORD2,r11
	cmplw cr1,rWORD3,rWORD4
/* Count is a multiple of 16, remainder is 0 */
	slw rWORD1,rWORD1,r11
	slw rWORD2,rWORD2,r11
	cmplw cr0,rWORD1,rWORD2
/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	andi. rBITDIF,rN,12 /* Get the word remainder */
	srwi rTMP,rN,4 /* Divide by 16 */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	cmplw cr5,rWORD5,rWORD6
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	lwzu rWORD7,16(rSTR1)
	lwzu rWORD8,16(rSTR2)
	cmplw cr5,rWORD7,rWORD8
	subfic rN,r12,32 /* Shift count is 32 - (rN * 8).  */
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	cmplw cr5,rWORD3,rWORD4
	subfic rN,r12,32 /* Shift count is 32 - (rN * 8).  */
/* Remainder is 12 */
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	cmplw cr0,rWORD1,rWORD2
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	cmplw cr5,rWORD1,rWORD2
	subfic rN,r12,32 /* Shift count is 32 - (rN * 8).  */
/* Count is a multiple of 16, remainder is 0 */
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	lwzu rWORD7,12(rSTR1)
	lwzu rWORD8,12(rSTR2)
	cmplw cr5,rWORD7,rWORD8
	bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	lwzu rWORD7,16(rSTR1)
	lwzu rWORD8,16(rSTR2)
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	subfic rN,r12,32 /* Shift count is 32 - (rN * 8).  */
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
	cfi_adjust_cfa_offset(-64)
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) is 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all the pending bytes are
   tested).  In this case we must complete the pending operations.  */
	sub rRTN,rWORD5,rWORD6
	sub rRTN,rWORD3,rWORD4
	sub rRTN,rWORD1,rWORD2
	cfi_adjust_cfa_offset(64)
/* At this point we know the strings have different alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then rSTR1 is word aligned and can
   perform the Wunaligned loop.

   Otherwise we know that rSTR1 is not already word aligned yet.
   So we can force the string addresses to the next lower word
   boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first W.  This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
#define rB r27 /* Left rotation temp for rWORD2. */
#define rD r26 /* Left rotation temp for rWORD4. */
#define rF r25 /* Left rotation temp for rWORD6. */
#define rH r24 /* Left rotation temp for rWORD8. */
#define rA r0 /* Right rotation temp for rWORD2. */
#define rC r12 /* Right rotation temp for rWORD4. */
#define rE r0 /* Right rotation temp for rWORD6. */
#define rG r12 /* Right rotation temp for rWORD8. */
	cfi_offset(r29,(40-64))
	cfi_offset(r28,(36-64))
	beq cr5,L(Wunaligned)
	cfi_offset(r27,(32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
   in the 1st rSTR1 W.  */
	sub r27,rSTR2,rBITDIF
/* But do not attempt to address the W before that W that contains
   the actual start of rSTR2.  */
	cfi_offset(r26,(28-64))
/* Compute the left/right shift counts for the unaligned rSTR2,
   compensating for the logical (W aligned) start of rSTR1.  */
	cfi_offset(r25,(24-64))
	cfi_offset(r24,(20-64))
	srwi rTMP,rN,4 /* Divide by 16 */
	andi. rBITDIF,rN,12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
   this special case those bits may be discarded anyway.  Also we
   must avoid loading a W where none of the bits are part of rSTR2 as
   this may cross a page boundary and cause a page fault.  */
	slw rWORD8,rWORD8,rSHL
	slw rWORD7,rWORD1,r11
	slw rWORD8,rWORD8,r11
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw cr5,rWORD7,rWORD8
	slw rWORD5,rWORD1,r11
	slw rWORD6,rWORD8,r11
/* Remainder is 12 */
	slw rWORD3,rWORD1,r11
	slw rWORD4,rWORD8,r11
/* Count is a multiple of 16, remainder is 0 */
	slw rWORD1,rWORD1,r11
	slw rWORD2,rWORD8,r11
/* At this point we know rSTR1 is word aligned and the
   compare length is at least 8 bytes.  */
	cfi_offset(r27,(32-64))
	cfi_offset(r26,(28-64))
	srwi rTMP,rN,4 /* Divide by 16 */
	cfi_offset(r25,(24-64))
	andi. rBITDIF,rN,12 /* Get the W remainder */
/* Fix: r24 is saved in the 20(r1) slot; the previous (24-64) offset
   collided with r25's slot (see cfi_offset(r25,(24-64)) above and the
   matching save in the aligned-entry prologue).  */
	cfi_offset(r24,(20-64))
	cmplw cr5,rWORD7,rWORD8
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
/* At this point we exit early with the first word compare
   complete and remainder of 0 to 3 bytes.  See L(du14) for details on
   how we handle the remaining bytes.  */
	cmplw cr5,rWORD7,rWORD8
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr5,rWORD7,rWORD8
/* Remainder is 12 */
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	cmplw cr0,rWORD1,rWORD2
	cmplw cr5,rWORD7,rWORD8
/* Count is a multiple of 16, remainder is 0 */
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	lwzu rWORD7,12(rSTR1)
	lwzu rWORD8,12(rSTR2)
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	bdz L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
	lwzu rWORD7,16(rSTR1)
	lwzu rWORD8,16(rSTR2)
	cmplw cr0,rWORD1,rWORD2
	cmplw cr1,rWORD3,rWORD4
	cmplw cr6,rWORD5,rWORD6
	cmplw cr5,rWORD7,rWORD8
/* At this point we have a remainder of 1 to 3 bytes to compare.  We use
   shift right to eliminate bits beyond the compare length.

   However it may not be safe to load rWORD2 which may be beyond the
   string length.  So we compare the bit length of the remainder to
   the right shift count (rSHR).  If the bit count is less than or equal
   we do not need to load rWORD2 (all significant bits are already in
   rB -- TODO(review): confirm against the elided L(du14) code).  */
	subfic rN,rN,32 /* Shift count is 32 - (rN * 8).  */
	bgt cr0,L(dureturn29)
	bgt cr1,L(dureturn29)
	bgt cr6,L(dureturn29)
	bgt cr5,L(dureturn29)
END (BP_SYM (memcmp))
libc_hidden_builtin_def (memcmp)
weak_alias (memcmp,bcmp)