1 /* Optimized memcmp implementation for POWER7/PowerPC64.
2 Copyright (C) 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
24 /* int [r3] memcmp (const char *s1 [r3],
29 EALIGN (BP_SYM(memcmp),4,0)
34 #define rSTR1 r3 /* first string arg */
35 #define rSTR2 r4 /* second string arg */
36 #define rN r5 /* max string length */
37 /* Note: The Bounded pointer support in this code is broken. This code
38 was inherited from PPC32 and and that support was never completed.
39 Current PPC gcc does not support -fbounds-check or -fbounded-pointers. */
40 #define rWORD1 r6 /* current word in s1 */
41 #define rWORD2 r7 /* current word in s2 */
42 #define rWORD3 r8 /* next word in s1 */
43 #define rWORD4 r9 /* next word in s2 */
44 #define rWORD5 r10 /* next word in s1 */
45 #define rWORD6 r11 /* next word in s2 */
46 #define rBITDIF r12 /* bits that differ in s1 & s2 words */
47 #define rWORD7 r30 /* next word in s1 */
48 #define rWORD8 r31 /* next word in s2 */
54 clrldi rBITDIF,rSTR1,61
56 beq- cr6,L(zeroLength)
59 /* If less than 8 bytes or not aligned, use the unalligned
61 blt cr1,L(bytealigned)
65 cfi_offset(rWORD7,-16)
67 /* At this point we know both strings have the same alignment and the
68 compare length is at least 8 bytes. rBITDIF containes the low order
69 3 bits of rSTR1 and cr5 contains the result of the logical compare
70 of rBITDIF to 0. If rBITDIF == 0 then we are already double word
71 aligned and can perform the DWaligned loop.
73 Otherwise we know the two strings have the same alignment (but not
74 yet DW). So we can force the string addresses to the next lower DW
75 boundary and special case this first DW word using shift left to
76 ellimiate bits preceeding the first byte. Since we want to join the
77 normal (DWaligned) compare loop, starting at the second double word,
78 we need to adjust the length (rN) and special case the loop
79 versioning for the first DW. This insures that the loop count is
80 correct and the first DW (shifted) is in the expected resister pair. */
88 srdi rTMP,rN,5 /* Divide by 32 */
89 andi. rBITDIF,rN,24 /* Get the DW remainder */
103 sld rWORD5,rWORD1,r11
104 sld rWORD6,rWORD2,r11
105 cmpld cr5,rWORD5,rWORD6
107 /* Do something useful in this cycle since we have to branch anyway. */
110 cmpld cr0,rWORD1,rWORD2
112 /* Remainder is 16 */
115 sld rWORD5,rWORD1,r11
116 sld rWORD6,rWORD2,r11
117 cmpld cr6,rWORD5,rWORD6
119 /* Do something useful in this cycle since we have to branch anyway. */
122 cmpld cr5,rWORD7,rWORD8
124 /* Remainder is 24 */
127 sld rWORD3,rWORD1,r11
128 sld rWORD4,rWORD2,r11
129 cmpld cr1,rWORD3,rWORD4
131 /* Count is a multiple of 32, remainder is 0 */
135 sld rWORD1,rWORD1,r11
136 sld rWORD2,rWORD2,r11
137 cmpld cr0,rWORD1,rWORD2
140 /* At this point we know both strings are double word aligned and the
141 compare length is at least 8 bytes. */
144 andi. rBITDIF,rN,24 /* Get the DW remainder */
145 srdi rTMP,rN,5 /* Divide by 32 */
146 cmpldi cr1,rBITDIF,16
157 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
158 (8-15 byte compare), we want to use only volitile registers. This
159 means we can avoid restoring non-volitile registers since we did not
160 change any on the early exit path. The key here is the non-early
161 exit path only cares about the condition code (cr5), not about which
162 register pair was used. */
165 cmpld cr5,rWORD5,rWORD6
169 cmpld cr0,rWORD1,rWORD2
173 cmpld cr1,rWORD3,rWORD4
176 cmpld cr6,rWORD5,rWORD6
183 cmpld cr5,rWORD7,rWORD8
192 subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
197 /* Remainder is 16 */
203 cmpld cr6,rWORD5,rWORD6
207 cmpld cr5,rWORD7,rWORD8
211 cmpld cr0,rWORD1,rWORD2
214 cmpld cr1,rWORD3,rWORD4
220 /* Again we are on a early exit path (16-23 byte compare), we want to
221 only use volitile registers and avoid restoring non-volitile
227 cmpld cr5,rWORD3,rWORD4
233 subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
238 /* Remainder is 24 */
244 cmpld cr1,rWORD3,rWORD4
248 cmpld cr6,rWORD5,rWORD6
252 cmpld cr5,rWORD7,rWORD8
255 cmpld cr0,rWORD1,rWORD2
261 /* Again we are on a early exit path (24-31 byte compare), we want to
262 only use volitile registers and avoid restoring non-volitile
268 cmpld cr5,rWORD1,rWORD2
274 subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
280 /* Count is a multiple of 32, remainder is 0 */
286 cmpld cr0,rWORD1,rWORD2
290 cmpld cr1,rWORD3,rWORD4
293 cmpld cr6,rWORD5,rWORD6
296 cmpld cr5,rWORD7,rWORD8
299 bdz- L(d24) /* Adjust CTR as we start with +4 */
300 /* This is the primary loop */
305 cmpld cr1,rWORD3,rWORD4
310 cmpld cr6,rWORD5,rWORD6
315 cmpld cr5,rWORD7,rWORD8
321 cmpld cr0,rWORD1,rWORD2
325 cmpld cr1,rWORD3,rWORD4
327 cmpld cr6,rWORD5,rWORD6
329 cmpld cr5,rWORD7,rWORD8
342 subfic rN,r12,64 /* Shift count is 64 - (rN * 8). */
344 /* At this point we have a remainder of 1 to 7 bytes to compare. Since
345 we are aligned it is safe to load the whole double word, and use
346 shift right double to elliminate bits beyond the compare length. */
352 cmpld cr5,rWORD1,rWORD2
393 beq cr6,L(zeroLength)
395 /* We need to prime this loop. This loop is swing modulo scheduled
396 to avoid pipe delays. The dependent instruction latencies (load to
397 compare to conditional branch) is 2 to 3 cycles. In this loop each
398 dispatch group ends in a branch and takes 1 cycle. Effectively
399 the first iteration of the loop only serves to load operands and
400 branches based on compares are delayed until the next loop.
402 So we must precondition some registers and condition codes so that
403 we don't exit the loop early on the first iteration. */
408 cmpld cr0,rWORD1,rWORD2
412 cmpld cr1,rWORD3,rWORD4
422 cmpld cr6,rWORD5,rWORD6
429 cmpld cr0,rWORD1,rWORD2
436 cmpld cr1,rWORD3,rWORD4
439 /* We speculatively loading bytes before we have tested the previous
440 bytes. But we must avoid overrunning the length (in the ctr) to
441 prevent these speculative loads from causing a segfault. In this
442 case the loop will exit early (before the all pending bytes are
443 tested. In this case we must complete the pending operations
480 sub rRTN,rWORD5,rWORD6
486 sub rRTN,rWORD3,rWORD4
490 sub rRTN,rWORD1,rWORD2
501 /* At this point we know the strings have different alignment and the
502 compare length is at least 8 bytes. rBITDIF containes the low order
503 3 bits of rSTR1 and cr5 contains the result of the logical compare
504 of rBITDIF to 0. If rBITDIF == 0 then rStr1 is double word
505 aligned and can perform the DWunaligned loop.
507 Otherwise we know that rSTR1 is not aready DW aligned yet.
508 So we can force the string addresses to the next lower DW
509 boundary and special case this first DW word using shift left to
510 ellimiate bits preceeding the first byte. Since we want to join the
511 normal (DWaligned) compare loop, starting at the second double word,
512 we need to adjust the length (rN) and special case the loop
513 versioning for the first DW. This insures that the loop count is
514 correct and the first DW (shifted) is in the expected resister pair. */
515 #define rSHL r29 /* Unaligned shift left count. */
516 #define rSHR r28 /* Unaligned shift right count. */
517 #define rB r27 /* Left rotation temp for rWORD2. */
518 #define rD r26 /* Left rotation temp for rWORD4. */
519 #define rF r25 /* Left rotation temp for rWORD6. */
520 #define rH r24 /* Left rotation temp for rWORD8. */
521 #define rA r0 /* Right rotation temp for rWORD2. */
522 #define rC r12 /* Right rotation temp for rWORD4. */
523 #define rE r0 /* Right rotation temp for rWORD6. */
524 #define rG r12 /* Right rotation temp for rWORD8. */
529 beq cr6,L(duzeroLength)
532 beq cr5,L(DWunaligned)
535 /* Adjust the logical start of rSTR2 ro compensate for the extra bits
536 in the 1st rSTR1 DW. */
537 sub r27,rSTR2,rBITDIF
538 /* But do not attempt to address the DW before that DW that contains
539 the actual start of rSTR2. */
543 /* Compute the leaft/right shift counts for the unalign rSTR2,
544 compensating for the logical (DW aligned) start of rSTR1. */
556 srdi rTMP,rN,5 /* Divide by 32 */
557 andi. rBITDIF,rN,24 /* Get the DW remainder */
558 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
559 this special case those bits may be discarded anyway. Also we
560 must avoid loading a DW where none of the bits are part of rSTR2 as
561 this may cross a page boundary and cause a page fault. */
566 sld rWORD8,rWORD8,rSHL
571 cmpldi cr1,rBITDIF,16
585 sld rWORD7,rWORD1,r11
586 sld rWORD8,rWORD8,r11
588 /* At this point we exit early with the first double word compare
589 complete and remainder of 0 to 7 bytes. See L(du14) for details on
590 how we handle the remaining bytes. */
591 cmpld cr5,rWORD7,rWORD8
601 /* Remainder is 16 */
605 sld rWORD5,rWORD1,r11
606 sld rWORD6,rWORD8,r11
608 /* Remainder is 24 */
612 sld rWORD3,rWORD1,r11
613 sld rWORD4,rWORD8,r11
615 /* Count is a multiple of 32, remainder is 0 */
621 sld rWORD1,rWORD1,r11
622 sld rWORD2,rWORD8,r11
625 /* At this point we know rSTR1 is double word aligned and the
626 compare length is at least 8 bytes. */
634 srdi rTMP,rN,5 /* Divide by 32 */
637 andi. rBITDIF,rN,24 /* Get the DW remainder */
643 cmpldi cr1,rBITDIF,16
664 cmpld cr5,rWORD7,rWORD8
670 cmpld cr0,rWORD1,rWORD2
677 cmpld cr1,rWORD3,rWORD4
682 cmpld cr6,rWORD5,rWORD6
685 /* At this point we exit early with the first double word compare
686 complete and remainder of 0 to 7 bytes. See L(du14) for details on
687 how we handle the remaining bytes. */
689 cmpld cr5,rWORD7,rWORD8
699 /* Remainder is 16 */
709 cmpld cr6,rWORD5,rWORD6
716 cmpld cr5,rWORD7,rWORD8
723 cmpld cr0,rWORD1,rWORD2
730 cmpld cr1,rWORD3,rWORD4
734 cmpld cr5,rWORD7,rWORD8
748 /* Remainder is 24 */
758 cmpld cr1,rWORD3,rWORD4
764 cmpld cr6,rWORD5,rWORD6
772 cmpld cr5,rWORD7,rWORD8
779 cmpld cr0,rWORD1,rWORD2
786 cmpld cr5,rWORD7,rWORD8
798 /* Count is a multiple of 32, remainder is 0 */
809 cmpld cr0,rWORD1,rWORD2
815 cmpld cr1,rWORD3,rWORD4
822 cmpld cr6,rWORD5,rWORD6
827 cmpld cr5,rWORD7,rWORD8
828 bdz L(du24) /* Adjust CTR as we start with +4 */
829 /* This is the primary loop */
834 cmpld cr1,rWORD3,rWORD4
842 cmpld cr6,rWORD5,rWORD6
850 cmpld cr5,rWORD7,rWORD8
858 cmpld cr0,rWORD1,rWORD2
867 cmpld cr1,rWORD3,rWORD4
869 cmpld cr6,rWORD5,rWORD6
871 cmpld cr5,rWORD7,rWORD8
881 /* At this point we have a remainder of 1 to 7 bytes to compare. We use
882 shift right double to elliminate bits beyond the compare length.
883 This allows the use of double word subtract to compute the final
886 However it may not be safe to load rWORD2 which may be beyond the
887 string length. So we compare the bit length of the remainder to
888 the right shift count (rSHR). If the bit count is less than or equal
889 we do not need to load rWORD2 (all significant bits are already in
901 subfic rN,rN,64 /* Shift count is 64 - (rN * 8). */
910 cmpld cr0,rWORD1,rWORD2
913 beq cr0,L(dureturn24)
924 bgt cr0,L(dureturn29)
934 bgt cr1,L(dureturn29)
944 bgt cr6,L(dureturn29)
954 bgt cr5,L(dureturn29)
982 END (BP_SYM (memcmp))
983 libc_hidden_builtin_def (memcmp)
984 weak_alias (memcmp,bcmp)