1 /* Optimized memcmp implementation for POWER7/PowerPC32.
2 Copyright (C) 2010-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
23 /* int [r3] memcmp (const char *s1 [r3],
28 EALIGN (BP_SYM(memcmp),4,0)
/* Register aliases used throughout.  r30/r31 (rWORD7/rWORD8) are
   non-volatile; per the comments below they are avoided on the early
   exit paths so those paths need not restore them.  */
33 #define rSTR1 r3 /* first string arg */
34 #define rSTR2 r4 /* second string arg */
35 #define rN r5 /* max string length */
36 #define rWORD1 r6 /* current word in s1 */
37 #define rWORD2 r7 /* current word in s2 */
38 #define rWORD3 r8 /* next word in s1 */
39 #define rWORD4 r9 /* next word in s2 */
40 #define rWORD5 r10 /* next word in s1 */
41 #define rWORD6 r11 /* next word in s2 */
42 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
43 #define rWORD7 r30 /* next word in s1 */
44 #define rWORD8 r31 /* next word in s2 */
/* Entry: isolate the low-order 2 bits of the s1 address (its word
   misalignment) into rBITDIF; a zero length returns immediately.  */
50 clrlwi rBITDIF,rSTR1,30
52 beq- cr6,L(zeroLength)
56 /* If less than 8 bytes or not aligned, use the unaligned
59 blt cr1,L(bytealigned)
61 cfi_adjust_cfa_offset(64)
63 cfi_offset(31,(48-64))
65 cfi_offset(30,(44-64))
67 /* At this point we know both strings have the same alignment and the
68 compare length is at least 8 bytes. rBITDIF contains the low order
69 2 bits of rSTR1 and cr5 contains the result of the logical compare
70 of rBITDIF to 0. If rBITDIF == 0 then we are already word
71 aligned and can perform the word aligned loop.
73 Otherwise we know the two strings have the same alignment (but not
74 yet word aligned). So we force the string addresses to the next lower
75 word boundary and special case this first word using shift left to
76 eliminate bits preceding the first byte. Since we want to join the
77 normal (word aligned) compare loop, starting at the second word,
78 we need to adjust the length (rN) and special case the loop
79 versioning for the first word. This ensures that the loop count is
80 correct and the first word (shifted) is in the expected register pair. */
/* Same-alignment setup: split the length into a 16-byte loop count
   (rTMP) and a word remainder (rBITDIF), then dispatch per remainder.
   The left shifts discard the bytes preceding the true start of the
   strings in the first (word-aligned) load.  */
88 srwi rTMP,rN,4 /* Divide by 16 */
89 andi. rBITDIF,rN,12 /* Get the word remainder */
103 slw rWORD5,rWORD1,r11
104 slw rWORD6,rWORD2,r11
105 cmplw cr5,rWORD5,rWORD6
107 /* Do something useful in this cycle since we have to branch anyway. */
110 cmplw cr0,rWORD1,rWORD2
115 slw rWORD5,rWORD1,r11
116 slw rWORD6,rWORD2,r11
117 cmplw cr6,rWORD5,rWORD6
119 /* Do something useful in this cycle since we have to branch anyway. */
122 cmplw cr5,rWORD7,rWORD8
124 /* Remainder is 12 */
127 slw rWORD3,rWORD1,r11
128 slw rWORD4,rWORD2,r11
129 cmplw cr1,rWORD3,rWORD4
131 /* Count is a multiple of 16, remainder is 0 */
135 slw rWORD1,rWORD1,r11
136 slw rWORD2,rWORD2,r11
137 cmplw cr0,rWORD1,rWORD2
140 /* At this point we know both strings are word aligned and the
141 compare length is at least 8 bytes. */
144 andi. rBITDIF,rN,12 /* Get the word remainder */
145 srwi rTMP,rN,4 /* Divide by 16 */
157 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
158 (8-15 byte compare), we want to use only volatile registers. This
159 means we can avoid restoring non-volatile registers since we did not
160 change any on the early exit path. The key here is the non-early
161 exit path only cares about the condition code (cr5), not about which
162 register pair was used. */
165 cmplw cr5,rWORD5,rWORD6
169 cmplw cr0,rWORD1,rWORD2
173 cmplw cr1,rWORD3,rWORD4
176 cmplw cr6,rWORD5,rWORD6
/* lwzu (load with update) advances the string pointers by 16 bytes
   per iteration as a side effect of the load.  */
180 lwzu rWORD7,16(rSTR1)
181 lwzu rWORD8,16(rSTR2)
183 cmplw cr5,rWORD7,rWORD8
192 subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
204 cmplw cr6,rWORD5,rWORD6
208 cmplw cr5,rWORD7,rWORD8
212 cmplw cr0,rWORD1,rWORD2
215 cmplw cr1,rWORD3,rWORD4
221 /* Again we are on an early exit path (16-23 byte compare), we want to
222 only use volatile registers and avoid restoring non-volatile
228 cmplw cr5,rWORD3,rWORD4
234 subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
240 /* Remainder is 12 */
246 cmplw cr1,rWORD3,rWORD4
250 cmplw cr6,rWORD5,rWORD6
254 cmplw cr5,rWORD7,rWORD8
257 cmplw cr0,rWORD1,rWORD2
263 /* Again we are on an early exit path (24-31 byte compare), we want to
264 only use volatile registers and avoid restoring non-volatile
270 cmplw cr5,rWORD1,rWORD2
276 subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
283 /* Count is a multiple of 16, remainder is 0 */
289 cmplw cr0,rWORD1,rWORD2
293 cmplw cr1,rWORD3,rWORD4
296 cmplw cr6,rWORD5,rWORD6
297 lwzu rWORD7,12(rSTR1)
298 lwzu rWORD8,12(rSTR2)
299 cmplw cr5,rWORD7,rWORD8
302 bdz- L(d24) /* Adjust CTR as we start with +4 */
303 /* This is the primary loop */
/* Four word-pairs compared per iteration; each pair's result lands in
   its own CR field (cr0/cr1/cr6/cr5) so the compares can overlap.  */
308 cmplw cr1,rWORD3,rWORD4
313 cmplw cr6,rWORD5,rWORD6
318 cmplw cr5,rWORD7,rWORD8
321 lwzu rWORD7,16(rSTR1)
322 lwzu rWORD8,16(rSTR2)
324 cmplw cr0,rWORD1,rWORD2
328 cmplw cr1,rWORD3,rWORD4
330 cmplw cr6,rWORD5,rWORD6
332 cmplw cr5,rWORD7,rWORD8
346 subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
348 /* At this point we have a remainder of 1 to 3 bytes to compare. Since
349 we are aligned it is safe to load the whole word, and use
350 shift right to eliminate bits beyond the compare length. */
404 cfi_adjust_cfa_offset(-64)
407 /* We need to prime this loop. This loop is swing modulo scheduled
408 to avoid pipe delays. The dependent instruction latencies (load to
409 compare to conditional branch) are 2 to 3 cycles. In this loop each
410 dispatch group ends in a branch and takes 1 cycle. Effectively
411 the first iteration of the loop only serves to load operands and
412 branches based on compares are delayed until the next loop.
414 So we must precondition some registers and condition codes so that
415 we don't exit the loop early on the first iteration. */
419 cmplw cr0,rWORD1,rWORD2
423 cmplw cr1,rWORD3,rWORD4
433 cmplw cr6,rWORD5,rWORD6
440 cmplw cr0,rWORD1,rWORD2
447 cmplw cr1,rWORD3,rWORD4
450 /* We speculatively load bytes before we have tested the previous
451 bytes. But we must avoid overrunning the length (in the ctr) to
452 prevent these speculative loads from causing a segfault. In this
453 case the loop will exit early (before all the pending bytes are
454 tested). In this case we must complete the pending operations
/* Result paths: rRTN receives the word difference of the first
   mismatching register pair.  */
491 sub rRTN,rWORD5,rWORD6
497 sub rRTN,rWORD3,rWORD4
502 sub rRTN,rWORD1,rWORD2
512 cfi_adjust_cfa_offset(64)
514 /* At this point we know the strings have different alignment and the
515 compare length is at least 8 bytes. rBITDIF contains the low order
516 2 bits of rSTR1 and cr5 contains the result of the logical compare
517 of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is word aligned and can
518 perform the Wunaligned loop.
520 Otherwise we know that rSTR1 is not already word aligned yet.
521 So we can force the string addresses to the next lower word
522 boundary and special case this first word using shift left to
523 eliminate bits preceding the first byte. Since we want to join the
524 normal (W aligned) compare loop, starting at the second word,
525 we need to adjust the length (rN) and special case the loop
526 versioning for the first W. This ensures that the loop count is
527 correct and the first W (shifted) is in the expected register pair. */
528 #define rSHL r29 /* Unaligned shift left count. */
529 #define rSHR r28 /* Unaligned shift right count. */
530 #define rB r27 /* Left rotation temp for rWORD2. */
531 #define rD r26 /* Left rotation temp for rWORD4. */
532 #define rF r25 /* Left rotation temp for rWORD6. */
533 #define rH r24 /* Left rotation temp for rWORD8. */
534 #define rA r0 /* Right rotation temp for rWORD2. */
535 #define rC r12 /* Right rotation temp for rWORD4. */
536 #define rE r0 /* Right rotation temp for rWORD6. */
537 #define rG r12 /* Right rotation temp for rWORD8. */
540 cfi_offset(r29,(40-64))
543 cfi_offset(r28,(36-64))
544 beq cr5,L(Wunaligned)
546 cfi_offset(r27,(32-64))
547 /* Adjust the logical start of rSTR2 to compensate for the extra bits
548 in the 1st rSTR1 W. */
549 sub r27,rSTR2,rBITDIF
550 /* But do not attempt to address the W before that W that contains
551 the actual start of rSTR2. */
554 cfi_offset(r26,(28-64))
555 /* Compute the left/right shift counts for the unaligned rSTR2,
556 compensating for the logical (W aligned) start of rSTR1. */
560 cfi_offset(r25,(24-64))
566 cfi_offset(r24,(20-64))
568 srwi rTMP,rN,4 /* Divide by 16 */
569 andi. rBITDIF,rN,12 /* Get the W remainder */
570 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
571 this special case those bits may be discarded anyway. Also we
572 must avoid loading a W where none of the bits are part of rSTR2 as
573 this may cross a page boundary and cause a page fault. */
578 slw rWORD8,rWORD8,rSHL
597 slw rWORD7,rWORD1,r11
598 slw rWORD8,rWORD8,r11
600 /* At this point we exit early with the first word compare
601 complete and remainder of 0 to 3 bytes. See L(du14) for details on
602 how we handle the remaining bytes. */
603 cmplw cr5,rWORD7,rWORD8
617 slw rWORD5,rWORD1,r11
618 slw rWORD6,rWORD8,r11
620 /* Remainder is 12 */
624 slw rWORD3,rWORD1,r11
625 slw rWORD4,rWORD8,r11
627 /* Count is a multiple of 16, remainder is 0 */
633 slw rWORD1,rWORD1,r11
634 slw rWORD2,rWORD8,r11
637 /* At this point we know rSTR1 is word aligned and the
638 compare length is at least 8 bytes. */
642 cfi_offset(r27,(32-64))
645 cfi_offset(r26,(28-64))
646 srwi rTMP,rN,4 /* Divide by 16 */
648 cfi_offset(r25,(24-64))
649 andi. rBITDIF,rN,12 /* Get the W remainder */
/* NOTE(review): the earlier save site records r24 at (20-64), while the
   line below says (24-64) — which collides with r25's offset above.
   Looks like a CFI typo; verify against the stack-save layout.  */
651 cfi_offset(r24,(24-64))
/* Unaligned (Wunaligned) compare paths: rSTR2 words are reassembled
   from adjacent loads via the rSHL/rSHR shift pairs defined above,
   then compared a word pair at a time exactly as in the aligned loop.  */
676 cmplw cr5,rWORD7,rWORD8
682 cmplw cr0,rWORD1,rWORD2
689 cmplw cr1,rWORD3,rWORD4
694 cmplw cr6,rWORD5,rWORD6
697 /* At this point we exit early with the first word compare
698 complete and remainder of 0 to 3 bytes. See L(du14) for details on
699 how we handle the remaining bytes. */
701 cmplw cr5,rWORD7,rWORD8
721 cmplw cr6,rWORD5,rWORD6
728 cmplw cr5,rWORD7,rWORD8
735 cmplw cr0,rWORD1,rWORD2
742 cmplw cr1,rWORD3,rWORD4
746 cmplw cr5,rWORD7,rWORD8
760 /* Remainder is 12 */
770 cmplw cr1,rWORD3,rWORD4
776 cmplw cr6,rWORD5,rWORD6
784 cmplw cr5,rWORD7,rWORD8
791 cmplw cr0,rWORD1,rWORD2
798 cmplw cr5,rWORD7,rWORD8
810 /* Count is a multiple of 16, remainder is 0 */
821 cmplw cr0,rWORD1,rWORD2
827 cmplw cr1,rWORD3,rWORD4
832 lwzu rWORD7,12(rSTR1)
833 lwzu rWORD8,12(rSTR2)
834 cmplw cr6,rWORD5,rWORD6
839 cmplw cr5,rWORD7,rWORD8
840 bdz L(du24) /* Adjust CTR as we start with +4 */
841 /* This is the primary loop */
846 cmplw cr1,rWORD3,rWORD4
854 cmplw cr6,rWORD5,rWORD6
862 cmplw cr5,rWORD7,rWORD8
868 lwzu rWORD7,16(rSTR1)
869 lwzu rWORD8,16(rSTR2)
870 cmplw cr0,rWORD1,rWORD2
879 cmplw cr1,rWORD3,rWORD4
881 cmplw cr6,rWORD5,rWORD6
883 cmplw cr5,rWORD7,rWORD8
893 /* At this point we have a remainder of 1 to 3 bytes to compare. We use
894 shift right to eliminate bits beyond the compare length.
896 However it may not be safe to load rWORD2 which may be beyond the
897 string length. So we compare the bit length of the remainder to
898 the right shift count (rSHR). If the bit count is less than or equal
899 we do not need to load rWORD2 (all significant bits are already in
911 subfic rN,rN,32 /* Shift count is 32 - (rN * 8). */
/* Mismatch found in one of the four CR fields: branch to the common
   return path that restores the saved non-volatile registers.  */
931 bgt cr0,L(dureturn29)
941 bgt cr1,L(dureturn29)
951 bgt cr6,L(dureturn29)
961 bgt cr5,L(dureturn29)
985 END (BP_SYM (memcmp))
/* Make memcmp usable by internal glibc callers without PLT overhead,
   and provide the traditional bcmp entry point as a weak alias.  */
986 libc_hidden_builtin_def (memcmp)
987 weak_alias (memcmp,bcmp)