1 /* Optimized memcmp implementation for PowerPC64.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
23 /* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
26 EALIGN (BP_SYM(memcmp), 4, 0)
/* NOTE(review): this is an elided excerpt -- the original line numbers
   embedded at the start of each line jump, so many instructions, labels,
   and the definitions of rTMP/rRTN are not visible here.  Comments below
   marked NOTE(review) flag assumptions that must be confirmed against
   the full file.  */
31 #define rSTR1 r3 /* first string arg */
32 #define rSTR2 r4 /* second string arg */
33 #define rN r5 /* max string length */
34 /* Note: The Bounded pointer support in this code is broken. This code
35 was inherited from PPC32 and that support was never completed.
36 Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
37 #define rWORD1 r6 /* current word in s1 */
38 #define rWORD2 r7 /* current word in s2 */
39 #define rWORD3 r8 /* next word in s1 */
40 #define rWORD4 r9 /* next word in s2 */
41 #define rWORD5 r10 /* next word in s1 */
42 #define rWORD6 r11 /* next word in s2 */
43 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
44 #define rWORD7 r30 /* next word in s1 */
45 #define rWORD8 r31 /* next word in s2 */
/* rTMP = rSTR1 ^ rSTR2.  The low 3 bits of the XOR are zero iff both
   pointers share the same alignment within a double word.  */
47 xor rTMP, rSTR2, rSTR1
/* clrldi. keeps only the low 3 bits: cr0 records whether the two
   alignments match; rBITDIF/cr5 record rSTR1's offset within its DW.  */
50 clrldi. rTMP, rTMP, 61
51 clrldi rBITDIF, rSTR1, 61
52 cmpldi cr5, rBITDIF, 0
/* NOTE(review): cr6 is presumably set by an elided compare of rN
   against 0 -- confirm in the full source.  */
53 beq- cr6, L(zeroLength)
56 /* If less than 8 bytes or not aligned, use the unaligned
/* NOTE(review): cr1 is presumably set by an elided compare of rN
   against 8 -- confirm in the full source.  */
58 blt cr1, L(bytealigned)
/* Unwind info for the (elided) store that saves rWORD7 (r30).  */
62 cfi_offset(rWORD7,-16)
64 /* At this point we know both strings have the same alignment and the
65 compare length is at least 8 bytes. rBITDIF contains the low order
66 3 bits of rSTR1 and cr5 contains the result of the logical compare
67 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
68 aligned and can perform the DWaligned loop.
70 Otherwise we know the two strings have the same alignment (but not
71 yet DW). So we can force the string addresses to the next lower DW
72 boundary and special case this first DW word using shift left to
73 eliminate bits preceding the first byte. Since we want to join the
74 normal (DWaligned) compare loop, starting at the second double word,
75 we need to adjust the length (rN) and special case the loop
76 versioning for the first DW. This ensures that the loop count is
77 correct and the first DW (shifted) is in the expected register pair. */
80 clrrdi rSTR1, rSTR1, 3
81 clrrdi rSTR2, rSTR2, 3
/* The main loop handles 32 bytes (4 DWs) per iteration; rTMP is the
   iteration count and rBITDIF selects the 0/8/16/24-byte entry case.  */
85 srdi rTMP, rN, 5 /* Divide by 32 */
86 andi. rBITDIF, rN, 24 /* Get the DW remainder */
89 cmpldi cr1, rBITDIF, 16
93 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
/* NOTE(review): the shift count in r11 is computed on an elided line
   (presumably from the alignment offset) -- confirm.  The sld discards
   the bytes that precede the true start of the strings.  */
100 sld rWORD5, rWORD1, r11
101 sld rWORD6, rWORD2, r11
102 cmpld cr5, rWORD5, rWORD6
104 /* Do something useful in this cycle since we have to branch anyway. */
107 cmpld cr0, rWORD1, rWORD2
109 /* Remainder is 16 */
112 sld rWORD5, rWORD1, r11
113 sld rWORD6, rWORD2, r11
114 cmpld cr6, rWORD5, rWORD6
116 /* Do something useful in this cycle since we have to branch anyway. */
119 cmpld cr5, rWORD7, rWORD8
121 /* Remainder is 24 */
124 sld rWORD3, rWORD1, r11
125 sld rWORD4, rWORD2, r11
126 cmpld cr1, rWORD3, rWORD4
128 /* Count is a multiple of 32, remainder is 0 */
131 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
132 sld rWORD1, rWORD1, r11
133 sld rWORD2, rWORD2, r11
134 cmpld cr0, rWORD1, rWORD2
137 /* At this point we know both strings are double word aligned and the
138 compare length is at least 8 bytes. */
141 andi. rBITDIF, rN, 24 /* Get the DW remainder */
142 srdi rTMP, rN, 5 /* Divide by 32 */
143 cmpldi cr1, rBITDIF, 16
153 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
154 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
155 (8-15 byte compare), we want to use only volatile registers. This
156 means we can avoid restoring non-volatile registers since we did not
157 change any on the early exit path. The key here is the non-early
158 exit path only cares about the condition code (cr5), not about which
159 register pair was used. */
162 cmpld cr5, rWORD5, rWORD6
166 cmpld cr0, rWORD1, rWORD2
170 cmpld cr1, rWORD3, rWORD4
173 cmpld cr6, rWORD5, rWORD6
/* ldu: load with update -- advances rSTR1/rSTR2 by 32 as a side
   effect, keeping the pointers in step with the 4-DW loop body.  */
177 ldu rWORD7, 32(rSTR1)
178 ldu rWORD8, 32(rSTR2)
180 cmpld cr5, rWORD7, rWORD8
189 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
194 /* Remainder is 16 */
197 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
200 cmpld cr6, rWORD5, rWORD6
204 cmpld cr5, rWORD7, rWORD8
208 cmpld cr0, rWORD1, rWORD2
211 cmpld cr1, rWORD3, rWORD4
217 /* Again we are on a early exit path (16-23 byte compare), we want to
218 only use volatile registers and avoid restoring non-volatile
224 cmpld cr5, rWORD3, rWORD4
230 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
235 /* Remainder is 24 */
238 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
241 cmpld cr1, rWORD3, rWORD4
245 cmpld cr6, rWORD5, rWORD6
249 cmpld cr5, rWORD7, rWORD8
252 cmpld cr0, rWORD1, rWORD2
253 addi rSTR1, rSTR1, 16
254 addi rSTR2, rSTR2, 16
258 /* Again we are on a early exit path (24-31 byte compare), we want to
259 only use volatile registers and avoid restoring non-volatile
265 cmpld cr5, rWORD1, rWORD2
268 addi rSTR1, rSTR1, 16
269 addi rSTR2, rSTR2, 16
271 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
277 /* Count is a multiple of 32, remainder is 0 */
280 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
283 cmpld cr0, rWORD1, rWORD2
287 cmpld cr1, rWORD3, rWORD4
290 cmpld cr6, rWORD5, rWORD6
291 ldu rWORD7, 24(rSTR1)
292 ldu rWORD8, 24(rSTR2)
293 cmpld cr5, rWORD7, rWORD8
296 bdz- L(d24) /* Adjust CTR as we start with +4 */
297 /* This is the primary loop */
/* Software-pipelined body: each cmpld tests the pair loaded on the
   previous rotation while new DWs stream in via ldu below.  */
302 cmpld cr1, rWORD3, rWORD4
307 cmpld cr6, rWORD5, rWORD6
312 cmpld cr5, rWORD7, rWORD8
315 ldu rWORD7, 32(rSTR1)
316 ldu rWORD8, 32(rSTR2)
318 cmpld cr0, rWORD1, rWORD2
322 cmpld cr1, rWORD3, rWORD4
324 cmpld cr6, rWORD5, rWORD6
326 cmpld cr5, rWORD7, rWORD8
339 subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
341 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
342 we are aligned it is safe to load the whole double word, and use
343 shift right double to eliminate bits beyond the compare length. */
347 srd rWORD1, rWORD1, rN
348 srd rWORD2, rWORD2, rN
349 cmpld cr5, rWORD1, rWORD2
/* Byte-at-a-time path: CTR counts the bytes remaining.  */
389 mtctr rN /* Power4 wants mtctr 1st in dispatch group */
390 beq- cr6, L(zeroLength)
392 /* We need to prime this loop. This loop is swing modulo scheduled
393 to avoid pipe delays. The dependent instruction latencies (load to
394 compare to conditional branch) is 2 to 3 cycles. In this loop each
395 dispatch group ends in a branch and takes 1 cycle. Effectively
396 the first iteration of the loop only serves to load operands and
397 branches based on compares are delayed until the next loop.
399 So we must precondition some registers and condition codes so that
400 we don't exit the loop early on the first iteration. */
405 cmpld cr0, rWORD1, rWORD2
409 cmpld cr1, rWORD3, rWORD4
/* lbzu: load byte with update, so each load also advances the string
   pointer.  NOTE(review): the mix of +2 and +1 displacements reflects
   pointer adjustments on elided lines -- confirm against the full
   priming sequence.  */
410 lbzu rWORD5, 2(rSTR1)
411 lbzu rWORD6, 2(rSTR2)
415 lbzu rWORD1, 1(rSTR1)
416 lbzu rWORD2, 1(rSTR2)
419 cmpld cr6, rWORD5, rWORD6
422 lbzu rWORD3, 1(rSTR1)
423 lbzu rWORD4, 1(rSTR2)
426 cmpld cr0, rWORD1, rWORD2
429 lbzu rWORD5, 1(rSTR1)
430 lbzu rWORD6, 1(rSTR2)
433 cmpld cr1, rWORD3, rWORD4
436 /* We speculatively loading bytes before we have tested the previous
437 bytes. But we must avoid overrunning the length (in the ctr) to
438 prevent these speculative loads from causing a segfault. In this
439 case the loop will exit early (before all the pending bytes are
440 tested). In this case we must complete the pending operations
/* Return value: the (zero-extended) byte difference of the first
   mismatching pair, matching memcmp's sign contract.  */
477 sub rRTN, rWORD5, rWORD6
483 sub rRTN, rWORD3, rWORD4
487 sub rRTN, rWORD1, rWORD2
498 /* At this point we know the strings have different alignment and the
499 compare length is at least 8 bytes. rBITDIF contains the low order
500 3 bits of rSTR1 and cr5 contains the result of the logical compare
501 of rBITDIF to 0. If rBITDIF == 0 then rSTR1 is double word
502 aligned and can perform the DWunaligned loop.
504 Otherwise we know that rSTR1 is not already DW aligned yet.
505 So we can force the string addresses to the next lower DW
506 boundary and special case this first DW word using shift left to
507 eliminate bits preceding the first byte. Since we want to join the
508 normal (DWaligned) compare loop, starting at the second double word,
509 we need to adjust the length (rN) and special case the loop
510 versioning for the first DW. This ensures that the loop count is
511 correct and the first DW (shifted) is in the expected register pair. */
/* Extra non-volatile registers for the unaligned path; the saves and
   restores for r24-r29 are on elided lines.  rA/rC/rE/rG deliberately
   reuse r0/r12 since the left/right rotation temps are never live
   simultaneously for the same word.  */
512 #define rSHL r29 /* Unaligned shift left count. */
513 #define rSHR r28 /* Unaligned shift right count. */
514 #define rB r27 /* Left rotation temp for rWORD2. */
515 #define rD r26 /* Left rotation temp for rWORD4. */
516 #define rF r25 /* Left rotation temp for rWORD6. */
517 #define rH r24 /* Left rotation temp for rWORD8. */
518 #define rA r0 /* Right rotation temp for rWORD2. */
519 #define rC r12 /* Right rotation temp for rWORD4. */
520 #define rE r0 /* Right rotation temp for rWORD6. */
521 #define rG r12 /* Right rotation temp for rWORD8. */
525 clrldi rSHL, rSTR2, 61
526 beq- cr6, L(duzeroLength)
529 beq cr5, L(DWunaligned)
532 /* Adjust the logical start of rSTR2 to compensate for the extra bits
533 in the 1st rSTR1 DW. */
534 sub r27, rSTR2, rBITDIF
535 /* But do not attempt to address the DW before that DW that contains
536 the actual start of rSTR2. */
537 clrrdi rSTR2, rSTR2, 3
540 /* Compute the left/right shift counts for the unaligned rSTR2,
541 compensating for the logical (DW aligned) start of rSTR1. */
543 clrrdi rSTR1, rSTR1, 3
547 cmpld cr5, r27, rSTR2
/* rSHL/rSHR split each misaligned rSTR2 DW across two aligned loads:
   left-shift one DW by rSHL bits, right-shift the next by rSHR = 64 -
   rSHL, then OR the pieces together.  */
552 subfic rSHR, rSHL, 64
553 srdi rTMP, rN, 5 /* Divide by 32 */
554 andi. rBITDIF, rN, 24 /* Get the DW remainder */
555 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
556 this special case those bits may be discarded anyway. Also we
557 must avoid loading a DW where none of the bits are part of rSTR2 as
558 this may cross a page boundary and cause a page fault. */
563 sld rWORD8, rWORD8, rSHL
568 cmpldi cr1, rBITDIF, 16
573 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
574 or rWORD8, rG, rWORD8
582 sld rWORD7, rWORD1, r11
583 sld rWORD8, rWORD8, r11
585 /* At this point we exit early with the first double word compare
586 complete and remainder of 0 to 7 bytes. See L(du14) for details on
587 how we handle the remaining bytes. */
588 cmpld cr5, rWORD7, rWORD8
598 /* Remainder is 16 */
602 sld rWORD5, rWORD1, r11
603 sld rWORD6, rWORD8, r11
605 /* Remainder is 24 */
609 sld rWORD3, rWORD1, r11
610 sld rWORD4, rWORD8, r11
612 /* Count is a multiple of 32, remainder is 0 */
615 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
616 or rWORD8, rG, rWORD8
618 sld rWORD1, rWORD1, r11
619 sld rWORD2, rWORD8, r11
622 /* At this point we know rSTR1 is double word aligned and the
623 compare length is at least 8 bytes. */
628 clrrdi rSTR2, rSTR2, 3
631 srdi rTMP, rN, 5 /* Divide by 32 */
634 andi. rBITDIF, rN, 24 /* Get the DW remainder */
640 cmpldi cr1, rBITDIF, 16
643 subfic rSHR, rSHL, 64
646 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
661 cmpld cr5, rWORD7, rWORD8
667 cmpld cr0, rWORD1, rWORD2
674 cmpld cr1, rWORD3, rWORD4
679 cmpld cr6, rWORD5, rWORD6
682 /* At this point we exit early with the first double word compare
683 complete and remainder of 0 to 7 bytes. See L(du14) for details on
684 how we handle the remaining bytes. */
686 cmpld cr5, rWORD7, rWORD8
696 /* Remainder is 16 */
706 cmpld cr6, rWORD5, rWORD6
713 cmpld cr5, rWORD7, rWORD8
720 cmpld cr0, rWORD1, rWORD2
727 cmpld cr1, rWORD3, rWORD4
731 cmpld cr5, rWORD7, rWORD8
745 /* Remainder is 24 */
755 cmpld cr1, rWORD3, rWORD4
761 cmpld cr6, rWORD5, rWORD6
769 cmpld cr5, rWORD7, rWORD8
774 addi rSTR1, rSTR1, 16
775 addi rSTR2, rSTR2, 16
776 cmpld cr0, rWORD1, rWORD2
780 addi rSTR1, rSTR1, 16
781 addi rSTR2, rSTR2, 16
783 cmpld cr5, rWORD7, rWORD8
795 /* Count is a multiple of 32, remainder is 0 */
798 mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
806 cmpld cr0, rWORD1, rWORD2
812 cmpld cr1, rWORD3, rWORD4
817 ldu rWORD7, 24(rSTR1)
818 ldu rWORD8, 24(rSTR2)
819 cmpld cr6, rWORD5, rWORD6
824 cmpld cr5, rWORD7, rWORD8
825 bdz- L(du24) /* Adjust CTR as we start with +4 */
826 /* This is the primary loop */
/* Unaligned primary loop: same 4-DW-per-iteration pipelined shape as
   the aligned loop; the rSTR2 merge (shift + OR) is on elided lines.  */
831 cmpld cr1, rWORD3, rWORD4
839 cmpld cr6, rWORD5, rWORD6
847 cmpld cr5, rWORD7, rWORD8
853 ldu rWORD7, 32(rSTR1)
854 ldu rWORD8, 32(rSTR2)
855 cmpld cr0, rWORD1, rWORD2
864 cmpld cr1, rWORD3, rWORD4
866 cmpld cr6, rWORD5, rWORD6
868 cmpld cr5, rWORD7, rWORD8
878 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
879 shift right double to eliminate bits beyond the compare length.
880 This allows the use of double word subtract to compute the final
883 However it may not be safe to load rWORD2 which may be beyond the
884 string length. So we compare the bit length of the remainder to
885 the right shift count (rSHR). If the bit count is less than or equal
886 we do not need to load rWORD2 (all significant bits are already in
898 subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
902 srd rWORD1, rWORD1, rN
903 srd rWORD2, rWORD2, rN
907 cmpld cr0, rWORD1, rWORD2
910 beq cr0, L(dureturn24)
/* Exit ladder: the first unequal compare field (cr0/cr1/cr6/cr5, in
   program order) selects the return path; the non-volatile register
   restores are on elided lines near L(dureturn24)/L(dureturn29).  */
921 bgt cr0, L(dureturn29)
931 bgt cr1, L(dureturn29)
941 bgt cr6, L(dureturn29)
951 bgt cr5, L(dureturn29)
/* END emits size/CFI bookkeeping; the hidden-def keeps intra-libc
   calls from going through the PLT, and bcmp is provided as a weak
   alias of memcmp.  */
979 END (BP_SYM (memcmp))
980 libc_hidden_builtin_def (memcmp)
981 weak_alias (memcmp, bcmp)