1 /* Optimized memcpy implementation for PowerPC A2.
2 Copyright (C) 2010-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* Tuning knobs and rtld-global accessors used by the copy loops below. */
20 #include <rtld-global-offsets.h>
/* NOTE(review): the #if/#else lines around this #define are not visible in
   this excerpt; MEMCPY is the exported entry-point name used by END_GEN_TB. */
23 # define MEMCPY memcpy
26 #define PREFETCH_AHEAD 4 /* number of cache lines to prefetch ahead of SRC */
27 #define ZERO_AHEAD 2 /* number of cache lines to zero (DCBZ) ahead of DST */
/* Declare access to the dl_cache_line_size field of _rtld_global_ro,
   read at runtime via __GLRO below. */
30 __GLRO_DEF(dl_cache_line_size)
/* void *memcpy(void *dst, const void *src, size_t len)
   Per the register lists in the original comments below:
     r3 = dst (always the return value), r4 = current source address,
     r5 = remaining length, r6 = current dest address.
   NOTE(review): this excerpt is line-sampled -- the embedded original line
   numbers are non-contiguous, so instructions between numbered lines
   (including the ENTRY directive, loop bodies, and several paired
   load/store and branch instructions) are NOT visible here.  The comments
   added in this review describe only the visible instructions; confirm
   against the full source before acting on them. */
41 dcbt 0,r4 /* Prefetch ONE SRC cacheline */
42 cmpldi cr1,r5,16 /* is size < 16 ? */
43 mr r6,r3 /* Copy dest reg to r6; */
47 /* Big copy (16 bytes or more)
49 Figure out how far to the nearest quadword boundary, or if we are
50 on one already. Also get the cache line size.
52 r3 - return value (always)
53 r4 - current source addr
55 r6 - current dest addr
/* First, get the destination onto a 16-byte boundary.  r8 receives the
   byte count to that boundary; r9 receives the runtime cache line size. */
58 neg r8,r3 /* LS 4 bits = # bytes to 8-byte dest bdry */
59 /* Get the cache line size. */
60 __GLRO (r9, dl_cache_line_size,
61 RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
62 clrldi r8,r8,64-4 /* align to 16byte boundary */
63 sub r7,r4,r3 /* compute offset to src from dest */
64 cmpldi cr0,r8,0 /* Were we aligned on a 16 byte bdy? */
65 addi r10,r9,-1 /* Cache line mask */
70 /* Destination is not aligned on quadword boundary. Get us to one.
72 r3 - return value (always)
73 r4 - current source addr
75 r6 - current dest addr
76 r7 - offset to src from dest
77 r8 - number of bytes to quadword boundary
/* cr7 gets the low bits of r8; each 1/2/4/8-byte copy below is presumably
   guarded by a conditional branch on cr7 that sits on an elided line --
   TODO confirm against the full source. */
80 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
81 subf r5,r8,r5 /* adjust remaining len */
84 lbzx r0,r7,r6 /* copy 1 byte addr */
89 lhzx r0,r7,r6 /* copy 2 byte addr */
94 lwzx r0,r7,r6 /* copy 4 byte addr */
99 ldx r0,r7,r6 /* copy 8 byte addr */
/* Dest (r6) was advanced on elided lines; recompute src as dest + offset. */
103 add r4,r7,r6 /* update src addr */
107 /* Dest is quadword aligned now.
109 Lots of decisions to make. If we are copying less than a cache
110 line we won't be here long. If we are not on a cache line
111 boundary we need to get there. And then we need to figure out
112 how many cache lines ahead to pre-touch.
114 r3 - return value (always)
115 r4 - current source addr
117 r6 - current dest addr
/* dl_cache_line_size may be 0 (e.g. before rtld has filled it in);
   fall back to a simple byte/2-byte copy loop in that case. */
123 cmpdi cr0,r9,0 /* Cache line size set? */
124 bne+ cr0,L(cachelineset)
126 /* Cache line size not set: generic byte copy without much optimization */
127 clrldi. r0,r5,63 /* If length is odd copy one byte */
128 beq L(cachelinenotset_align)
129 lbz r7,0(r4) /* Read one byte from source */
130 addi r5,r5,-1 /* Update length */
131 addi r4,r4,1 /* Update source pointer address */
132 stb r7,0(r6) /* Store one byte at dest */
133 addi r6,r6,1 /* Update dest pointer address */
134 L(cachelinenotset_align):
135 cmpdi cr7,r5,0 /* If length is 0 return */
137 ori r2,r2,0 /* Force a new dispatch group */
/* NOTE(review): the loop below advances both pointers by 2 but only one
   lbz/stb pair is visible; the companion 1(r4)/1(r6) load/store appear
   to be on elided lines (original lines 141/144) -- confirm. */
138 L(cachelinenotset_loop):
139 addic. r5,r5,-2 /* Update length */
140 lbz r7,0(r4) /* Load 2 bytes from source */
142 addi r4,r4,2 /* Update source pointer address */
143 stb r7,0(r6) /* Store 2 bytes on dest */
145 addi r6,r6,2 /* Update dest pointer address */
146 bne L(cachelinenotset_loop)
/* Cache line size is known.  r10 here still appears to hold the
   line-size mask computed above (r9 - 1) -- confirm; it is re-used as a
   full-line count further down, presumably set on elided lines. */
151 cmpd cr5,r5,r10 /* Less than a cacheline to go? */
153 neg r7,r6 /* How far to next cacheline bdy? */
155 addi r6,r6,-8 /* prepare for stdu */
157 addi r4,r4,-8 /* prepare for ldu */
160 ble+ cr5,L(lessthancacheline)
/* cr0 presumably compares r9 against 128 on an elided line: choose the
   64-byte-line path (fall through) or the 128-byte-line path. */
162 beq- cr0,L(big_lines) /* 128 byte line code */
166 /* More than a cacheline left to go, and using 64 byte cachelines */
168 clrldi r7,r7,64-6 /* How far to next cacheline bdy? */
170 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
172 /* Reduce total len by what it takes to get to the next cache line */
174 srdi r7,r7,4 /* How many qws to get to the line bdy? */
176 /* How many full cache lines to copy after getting to a line bdy? */
179 cmpldi r10,0 /* If no full cache lines to copy ... */
180 li r11,0 /* number cachelines to copy with prefetch */
181 beq L(nocacheprefetch)
184 /* We are here because we have at least one full cache line to copy,
185 and therefore some pre-touching to do. */
/* Cap the initial prefetch batch at PREFETCH_AHEAD lines; r12 is the
   dcbt prefetch distance (one line + 8, matching the ldu bias above). */
187 cmpldi r10,PREFETCH_AHEAD
188 li r12,64+8 /* prefetch distance */
189 ble L(lessthanmaxprefetch)
191 /* We can only do so much pre-fetching. R11 will have the count of
192 lines left to prefetch after the initial batch of prefetches
195 subi r11,r10,PREFETCH_AHEAD
196 li r10,PREFETCH_AHEAD
198 L(lessthanmaxprefetch):
201 /* At this point r10/ctr hold the number of lines to prefetch in this
202 initial batch, and r11 holds any remainder. */
210 /* Prefetching is done, or was not needed.
212 cr6 - are we on a cacheline boundary already?
213 r7 - number of quadwords to the next cacheline boundary
219 cmpldi cr1,r5,64 /* Less than a cache line to copy? */
221 /* How many bytes are left after we copy whatever full
222 cache lines we can get? */
225 beq cr6,L(cachelinealigned)
228 /* Copy quadwords up to the next cacheline boundary */
/* NOTE(review): the L(aligntocacheline) loop body (ldu/stdu pair and the
   mtctr that loads r7) is on elided lines; only the bdnz is visible. */
236 bdnz L(aligntocacheline)
240 L(cachelinealigned): /* copy while cache lines */
242 blt- cr1,L(lessthancacheline) /* size <64 */
/* r11 = DCBZ distance: zero the destination ZERO_AHEAD lines ahead of the
   store stream (64-byte lines), biased by 8 to match the stdu pointer. */
249 li r11,64*ZERO_AHEAD +8 /* DCBZ dist */
252 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
253 L(loop): /* Copy aligned body */
254 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
/* NOTE(review): the bodies of L(loop) and L(loop2) (the unrolled ld/std
   sequences, dcbz, and loop-control branches) are entirely elided here. */
282 L(loop2): /* Copy aligned body */
/* Tail handling: copy remaining quadwords (ctr = r5 / 16), then 0-15
   leftover bytes via the conditional 8/4/2/1-byte sequence below. */
306 L(lessthancacheline): /* Was there less than cache to do ? */
308 srdi r7,r5,4 /* divide size by 16 */
318 bdnz L(copy_remaining)
320 L(do_lt16): /* less than 16 ? */
321 cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
322 beqlr+ /* no rest to copy */
/* As with the alignment code above, each load below is presumably paired
   with a store and guarded by a cr-bit branch on elided lines. */
326 L(shortcopy): /* SIMPLE COPY to handle size =< 15 bytes */
330 ldx r0,r7,r6 /* copy 8 byte */
335 lwzx r0,r7,r6 /* copy 4 byte */
340 lhzx r0,r7,r6 /* copy 2 byte */
345 lbzx r0,r7,r6 /* copy 1 byte */
354 /* Similar to above, but for use with 128 byte lines. */
/* 128-byte-line variant: identical structure to the 64-byte path, with
   the line-size constants (masks, prefetch/DCBZ distances) doubled. */
359 clrldi r7,r7,64-7 /* How far to next cacheline bdy? */
361 cmpldi cr6,r7,0 /* Are we on a cacheline bdy already? */
363 /* Reduce total len by what it takes to get to the next cache line */
365 srdi r7,r7,4 /* How many qws to get to the line bdy? */
367 /* How many full cache lines to copy after getting to a line bdy? */
370 cmpldi r10,0 /* If no full cache lines to copy ... */
371 li r11,0 /* number cachelines to copy with prefetch */
372 beq L(nocacheprefetch_128)
375 /* We are here because we have at least one full cache line to copy,
376 and therefore some pre-touching to do. */
378 cmpldi r10,PREFETCH_AHEAD
379 li r12,128+8 /* prefetch distance */
380 ble L(lessthanmaxprefetch_128)
382 /* We can only do so much pre-fetching. R11 will have the count of
383 lines left to prefetch after the initial batch of prefetches
386 subi r11,r10,PREFETCH_AHEAD
387 li r10,PREFETCH_AHEAD
389 L(lessthanmaxprefetch_128):
392 /* At this point r10/ctr hold the number of lines to prefetch in this
393 initial batch, and r11 holds any remainder. */
/* NOTE(review): the L(prefetchSRC_128) loop body (dcbt and ctr setup) is
   on elided lines; only the closing bdnz is visible. */
398 bdnz L(prefetchSRC_128)
401 /* Prefetching is done, or was not needed.
403 cr6 - are we on a cacheline boundary already?
404 r7 - number of quadwords to the next cacheline boundary
407 L(nocacheprefetch_128):
410 cmpldi cr1,r5,128 /* Less than a cache line to copy? */
412 /* How many bytes are left after we copy whatever full
413 cache lines we can get? */
416 beq cr6,L(cachelinealigned_128)
419 /* Copy quadwords up to the next cacheline boundary */
421 L(aligntocacheline_128):
427 bdnz L(aligntocacheline_128)
430 L(cachelinealigned_128): /* copy while cache lines */
432 blt- cr1,L(lessthancacheline) /* size <128 */
439 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
442 /* Copy whole cachelines, optimized by prefetching SRC cacheline */
443 L(loop_128): /* Copy aligned body */
444 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
488 L(loop2_128): /* Copy aligned body */
/* Finish the 128-byte path through the shared sub-cacheline tail code. */
526 b L(lessthancacheline)
/* End of function: emit the traceback table (TOC-less variant) and the
   libc-internal hidden alias for memcpy. */
529 END_GEN_TB (MEMCPY,TB_TOCLESS)
530 libc_hidden_builtin_def (memcpy)