1 /* Optimized memcpy implementation for POWER10.
2 Copyright (C) 2021-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
23 # define MEMCPY memcpy
26 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
30 ENTRY_TOCLESS (MEMCPY, 5)
33 /* Copy up to 16 bytes. */
34 sldi r6,r5,56 /* Prepare [l|st]xvl counter: r6 = len << 56. */
37 subic. r6,r5,16 /* r6 = len - 16 (sets CR0). Return if len <= 16. */
40 /* If len >= 256, assume nothing has been copied yet and copy the
41 first bytes again. This might cause issues with overlapping memory,
42 but memcpy is not required to support overlapping memory. */
45 /* 16 < len < 256 and the first 16 bytes have already been copied. */
46 addi r10,r3,16 /* r10 = dst + 16, the working destination; keep r3 intact as return value. */
49 b L(copy_lt_256) /* Avoid the main loop if len < 256. */
53 mr r10,r3 /* r10 = dst, the working destination; keep r3 intact as return value. */
54 /* Align dst to 16 bytes. */
56 beq L(dst_is_align_16)
65 srdi r9,r5,7 /* r9 = len / 128; the main loop copies 128 bytes per iteration. */
71 /* Main loop, copy 128 bytes per iteration.
72 Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
107 clrldi. r5,r5,64-7 /* r5 = len % 128 (sets CR0). Have we copied everything? */
114 srdi. r9,r5,5 /* r9 = remaining len / 32 (sets CR0). */
117 /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
118 the dependency on r4 and r10. */
125 /* Copy 32 bytes at a time, unaligned.
126 The loop is unrolled 3 times in order to reduce the dependency on
127 r4 and r10, copying up to 96 bytes per iteration. */
153 clrldi. r5,r5,64-5 /* r5 = len % 32 (sets CR0). Have we copied everything? */
161 clrldi. r5,r5,64-5 /* r5 = len % 32 (sets CR0). Have we copied everything? */
163 /* 32 bytes have been copied since the last update of r4 and r10. */
172 clrldi. r5,r5,64-5 /* r5 = len % 32 (sets CR0). Have we copied everything? */
174 /* The last iteration of the loop copied 64 bytes. Update r4 and r10
197 END_GEN_TB (MEMCPY,TB_TOCLESS)
198 libc_hidden_builtin_def (memcpy)