1 /* Copyright (C) 2014-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
4 The GNU C Library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
9 The GNU C Library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public
15 License along with the GNU C Library; if not, see
16 <http://www.gnu.org/licenses/>. */
20 /* Implements the functions
22 char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
26 char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
28 The algorithm is as follows:
29 > if src and dest are 8 byte aligned, perform double word copy
31 > copy byte by byte on unaligned addresses.
33 The aligned comparisons are made using cmpb instructions. */
35 /* The optimizations for performance focus on the following:
36 1. data alignment [gain from aligned memory access on read/write]
37 2. POWER7 gains performance with loop unrolling/unwinding
38 [gain by reduction of branch penalty].
39 3. The final pad with null bytes is done by calling an optimized
43 # define FUNC_NAME __stpncpy
45 # define FUNC_NAME strncpy
48 #define FRAMESIZE (FRAME_MIN_SIZE+32)
51 /* For builds with no IFUNC support, local calls should be made to the internal
52 GLIBC symbol (created by libc_hidden_builtin_def). */
54 # define MEMSET __GI_memset
56 # define MEMSET memset
/* FUNC_NAME entry: r3 = dst, r4 = src, r5 = n.  LR, r18 and r19 are
   saved in the prologue and reloaded in each epilogue; r18 keeps the
   original dst so that strncpy can return it.  */
61 EALIGN(FUNC_NAME, 4, 0)
64 mflr r0 /* r0 = link register (return address) */
65 or r10, r3, r4 /* combine dst and src bits so one test covers both */
66 rldicl. r8, r10, 0, 61 /* r8 = low 3 bits; zero iff both are doubleword aligned */
68 std r19, -8(r1) /* save caller's r19 */
69 std r18, -16(r1) /* save caller's r18 */
70 std r0, 16(r1) /* store the return address held in r0 */
71 stdu r1, -FRAMESIZE(r1) /* create the stack frame of FRAMESIZE bytes */
73 mr r9, r3 /* r9 = working copy of dst */
74 mr r18, r3 /* r18 = original dst, returned by strncpy */
75 bne 0, L(byte_by_byte) /* cr0 from rldicl.: dst/src not both aligned */
78 srdi r11, r5, 3 /* r11 = n / 8, the doubleword count */
79 cmpldi cr7, r11, 3 /* cr7 = count vs. 3 (unsigned), for the unrolling decision */
82 ld r10, 0(r4) /* load doubleword from src offset 0 */
83 cmpb r8, r10, r8 /* byte-wise compare against r8 to detect NUL bytes just read */
84 cmpdi cr7, r8, 0 /* a zero cmpb result means no NUL byte was found */
87 std r10, 0(r3) /* store doubleword to dst offset 0 */
88 ld r10, 8(r4) /* load doubleword from src offset 8 */
89 cmpb r8, r10, r8 /* byte-wise compare against r8 to detect NUL bytes just read */
90 cmpdi cr7, r8, 0 /* a zero cmpb result means no NUL byte was found */
105 ld r8, 24(r4) /* load doubleword from src offset 24 (unrolled step) */
110 std r8, 24(r7) /* store doubleword at offset 24 from r7 */
113 bdz L(leftDwords) /* decrement CTR; branch out when it reaches zero */
126 mr r6, r4 /* r6 = current src pointer */
132 std r10, 8(r9) /* store doubleword to dst offset 8 */
138 beq cr7, L(dWordUnroll)
140 addi r9, r9, 16 /* advance dst by 16 */
141 addi r4, r4, 16 /* advance src by 16 */
142 addi r5, r5, -16 /* n -= 16 */
143 addi r0, r11, -2 /* r0 = doubleword count - 2 */
146 ld r10, 0(r4) /* load doubleword from src offset 0 */
147 li r8, 0 /* r8 = 0 (mask) */
150 bne cr7, L(byte_by_byte)
156 L(loadDWordandCompare):
160 bne cr7, L(byte_by_byte)
167 bdnz L(loadDWordandCompare) /* decrement CTR; loop while it is non-zero */
171 ble cr7, L(verifyByte)
179 lbz r10, 1(r4) /* load byte from src offset 1 */
180 cmpdi cr7, r10, 0 /* is it the NUL terminator? */
181 stb r10, 1(r19) /* store byte at offset 1 from r19 */
182 beq cr7, L(updtDestComputeN2ndByte)
184 addi r4, r4, 4 /* advance src by 4 */
186 lbz r10, -2(r4) /* load byte at src - 2 (unrolled byte copy) */
189 beq cr7, L(updtDestComputeN3rdByte)
191 lbz r10, -1(r4) /* load byte at src - 1 (unrolled byte copy) */
195 beq cr7, L(ComputeNByte)
200 lbz r10, 0(r4) /* load byte at src + 0 (unrolled byte copy) */
203 bne cr7, L(bytes_unroll)
207 subf r9, r19, r9 /* r9 = r9 - r19; part of computing the bytes to fill */
211 cmpdi cr7, r8, 0 /* is the fill length in r8 zero? */
212 beq cr7, L(update3return)
214 mr r3, r19 /* MEMSET arg 1: address to fill from */
215 li r4, 0 /* MEMSET arg 2: fill with zero bytes */
216 mr r5, r8 /* MEMSET arg 3: number of bytes to fill */
217 bl MEMSET /* call the optimized memset */
221 #ifdef USE_AS_STPNCPY
222 addi r3, r19, -1 /* stpncpy return value: r19 - 1 */
226 #ifndef USE_AS_STPNCPY
227 mr r3, r18 /* strncpy return value: original dst */
229 addi r1, r1, FRAMESIZE /* pop the stack frame */
230 ld r0, 16(r1) /* reload the saved return address */
231 ld r18, -16(r1) /* restore caller's r18 */
232 ld r19, -8(r1) /* restore caller's r19 */
233 mtlr r0 /* put the return address back into LR */
242 rldicl. r8, r5, 0, 62 /* r8 = r5 & 3 (low 2 bits); sets cr0 */
243 #ifdef USE_AS_STPNCPY
246 beq cr0, L(hop2return)
257 lbzu r10, 1(r4) /* load byte at r4 + 1 and advance r4 to it */
267 addi r1, r1, FRAMESIZE /* pop the stack frame */
268 #ifdef USE_AS_STPNCPY
269 mr r3, r19 /* stpncpy return value: r19 */
271 mr r3, r18 /* strncpy return value: original dst */
273 ld r0, 16(r1) /* reload the saved return address */
274 ld r18, -16(r1) /* restore caller's r18 */
275 ld r19, -8(r1) /* restore caller's r19 */
276 mtlr r0 /* put the return address back into LR */
287 bne cr7, L(dWordUnrollOFF)
291 L(updtDestComputeN2ndByte):
292 addi r19, r19, 2 /* advance dst by 2 */
293 subf r9, r19, r9 /* r9 = r9 - r19 */
298 L(updtDestComputeN3rdByte):
299 addi r19, r19, 3 /* advance dst by 3 */
300 subf r9, r19, r9 /* r9 = r9 - r19 */
306 addi r9, r9, 24 /* advance dst by 24 */
307 addi r4, r4, 24 /* advance src by 24 */
308 addi r5, r5, -24 /* n -= 24 */
309 addi r0, r11, -3 /* r0 = doubleword count - 3 */
319 addi r9, r7, 40 /* dst = r7 + 40 */
320 addi r4, r6, 40 /* src = r6 + 40 */
321 addi r5, r5, -40 /* n -= 40 */
322 addi r0, r11, -5 /* r0 = doubleword count - 5 */
330 addi r9, r3, 8 /* dst = r3 + 8 */
331 addi r4, r4, 8 /* advance src by 8 */
332 addi r5, r5, -8 /* n -= 8 */
333 addi r0, r11, -1 /* r0 = doubleword count - 1 */
336 #ifndef USE_AS_STPNCPY
337 libc_hidden_builtin_def (strncpy)