1 /* Optimized strncat implementation for PowerPC64/POWER7.
3 Copyright (C) 2014-2015 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 /* The algorithm chooses between aligned and unaligned memory access:
22 if (address of s2 & 0x7UL) is zero, i.e. s2 is 8-byte aligned,
23 perform aligned doubleword catenation
25 otherwise perform unaligned (byte-by-byte) catenation
27 The aligned comparisons are made using cmpb instructions. */
29 /* char* [r3] strncat (char *s1 [r3],
37 # define STRNCAT strncat
41 /* For builds with no IFUNC support, local calls should be made to the internal
42 GLIBC symbol (created by libc_hidden_builtin_def). */
44 # define STRLEN __GI_strlen
46 # define STRLEN strlen
/* Size in bytes of the stack frame that the prologue allocates with stdu and
   that each return path releases again.  NOTE(review): the extra 32 bytes
   presumably cover the three saved doublewords (r29-r31) plus alignment
   padding -- confirm against the ABI frame layout.  */
50 #define FRAMESIZE (FRAME_MIN_SIZE+32)
/* strncat body.  On entry r3 = s1 (destination), r4 = s2 (source), r5 = n.
   Bytes of s2 are appended after the end of s1, the result is terminated
   with a NUL byte, and s1 is returned in r3.  For the rest of the routine
   r29 = n, r30 = s1 and r31 = s2.  */
56 mflr r0 /* Copy the link register into r0 so it can be saved below. */
58 /* r29, r30 and r31 are non-volatile registers used to keep n, s1 and s2
59 across the strlen call; save the caller's values in the GPR save area. */
60 std r29, -24(r1) /* Save caller's r29. */
61 std r30, -16(r1) /* Save caller's r30. */
62 std r31, -8(r1) /* Save caller's r31. */
64 std r0, 16(r1) /* Save the return address (copy of LR) at 16(r1). */
65 stdu r1, -FRAMESIZE(r1) /* Allocate FRAMESIZE bytes and store the old r1 there. */
67 /* Hint the cache about both strings before they are read. */
68 dcbt 0, r3 /* Pre-fetch s1 (r3) to avoid cache
70 dcbt 0, r4 /* Pre-fetch s2 (r4) to avoid cache
73 mr. r29, r5 /* Save "n" in r29; the record form also sets cr0 from n. */
74 mr r30, r3 /* Keep "s1" in r30; it is returned in r3 at the end. */
77 mr r31, r4 /* Keep "s2" in r31. */
78 bl STRLEN /* Call optimized strlen on s1 (still in r3); goto
81 cmpldi cr7, r29, 7 /* If n (r29) is <= 7, process
83 add r3, r30, r3 /* r3 = s1 + strlen (s1): the terminating NUL of s1. */
84 bgt cr7,L(alignment) /* n > 7: take the L(alignment) path. */
86 cmpldi cr7, r29, 3 /* If n is >= 4 (n > 3), we can
88 addi r9, r3, -1 /* Make r9 point one byte before the next
89 character of "s1" to be written. */
90 bgt cr7, L(bytes_unroll) /* n > 3: branch to the unrolled byte code. */
95 cmpdi cr7, r10, 0 /* Check for the NUL byte in "s2". */
114 li r10, 0 /* r10 = NUL byte for termination. */
115 stb r10, 1(r9) /* Write it at 1(r9): terminate s1 with
117 .p2align 4 /* Align L(done) to a 16-byte boundary. */
118 L(done): /* Return path: release the frame and return s1. */
119 addi r1, r1, FRAMESIZE /* Release the frame; r1 is the caller's stack pointer. */
120 mr r3, r30 /* Return value is the saved "s1" pointer, not the length of
122 ld r0, 16(r1) /* Reload the saved return address. */
123 ld r29, -24(r1) /* Restore caller's r29. */
124 ld r30, -16(r1) /* Restore caller's r30. */
125 ld r31, -8(r1) /* Restore caller's r31. */
126 mtlr r0 /* Put the return address back in LR. */
127 blr /* Return to the caller. */
131 rldicl. r9, r31, 0, 61 /* r9 = s2 & 7; zero means s2 is 8-byte aligned. */
132 beq cr0,L(dwordAligned) /* Aligned: copy a doubleword at a time. */
135 /* s2 is not doubleword aligned, so process it byte by byte.
136 POWER7 benefits from unrolling this loop. */
144 lbz r10, 1(r31) /* Load a source byte from 1(r31). */
145 cmpdi cr7, r10, 0 /* Compare; if the byte is not zero,
147 stb r10, 2(r9) /* Store the byte at 2(r9). */
151 lbz r10, -2(r31) /* Perform loop unroll here on byte
157 lbz r10, -1(r31) /* Unrolled step: byte at -1(r31). */
165 lbz r10, 0(r31) /* Unrolled step: byte at 0(r31). */
171 /* s2 is doubleword aligned: load and store a doubleword at a time. */
173 /* Read and write 8 bytes per iteration. */
174 srdi r8, r29, 3 /* r8 = n / 8, the count for CTR to loop;
176 li r7, 0 /* r7 = 0: NUL byte used for the compare and the terminator. */
177 li r10, 0 /* r10 = 0: cmpb operand used to find zero bytes. */
179 mtctr r8 /* CTR = number of whole doublewords in n. */
181 ld r9, 0(r31) /* Read a doubleword from s2. */
182 cmpb r6, r9, r10 /* Compare the bytes of s2 we read
184 cmpdi r6, 0 /* If cmpb returned zero (no NUL byte),
187 std r9, 0(r3) /* Append the doubleword from s2
189 addi r3, r3, 8 /* Advance the s1 write pointer by 8. */
190 addi r31, r31, 8 /* Advance s2 by 8. */
191 subi r29, r29, 8 /* Eight fewer bytes of "n" remain. */
192 bdnz L(loop8) /* Continue until "count" is
196 cmpdi r29, 0 /* If no bytes of "n" remain, we skip. */
199 mtctr r29 /* CTR = left-over bytes of "n" to process. */
201 lbz r9, 0(r31) /* Read a byte from s2. */
202 cmpw r9, r7 /* If the byte is NUL, we stop here. */
203 beq+ L(align8align) /* NUL found: stop copying (hinted as likely taken). */
204 stb r9, 0(r3) /* Not NUL: store the byte into s1. */
205 addi r3, r3, 1 /* Advance the s1 write pointer by 1. */
206 addi r31, r31, 1 /* Advance s2 by 1. */
207 bdnz L(unaligned0) /* Decrement the CTR copy of "n" and loop
210 stb r7, 0(r3) /* Terminate s1 with a NUL byte. */
212 addi r1, r1, FRAMESIZE /* Release the frame; r1 is the caller's stack pointer. */
213 mr r3, r30 /* Return value is the saved "s1" pointer, not the length of
215 ld r0, 16(r1) /* Reload the saved return address. */
216 ld r29, -24(r1) /* Restore caller's r29. */
217 ld r30, -16(r1) /* Restore caller's r30. */
218 ld r31, -8(r1) /* Restore caller's r31. */
219 mtlr r0 /* Put the return address back in LR. */
220 blr /* Return to the caller. */
224 rldicl. r29, r29, 0, 62 /* r29 = n & 3; cr0 records whether a remainder is left. */
225 bne cr0,L(byte_by_byte) /* Non-zero remainder: copy those bytes one by one. */
226 b L(nullTerminate) /* Otherwise, finish catenation with