1 /* Optimized version of the standard memcpy() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2023 Free Software Foundation, Inc.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
26 An assembly implementation of the algorithm used by the generic C
27 version from glibc. The case when source and dest are aligned is
28 treated separately, for extra performance.
30 In this form, memcpy assumes little endian mode. For big endian mode,
31 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
32 and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
// Tuning constants for the copy loops.
// LFETCH_DIST: byte offset added to src and dest to form the initial
// prefetch addresses ptr1 and ptr2.
40 #define LFETCH_DIST 500
// The aligned loop L1 moves ALIGN_UNROLL_no 8-byte elements per iteration;
// ALIGN_UNROLL_sh is log2 of that count and converts an element count into
// a loop count by shifting.
42 #define ALIGN_UNROLL_no 4 // no. of elements
43 #define ALIGN_UNROLL_sh 2 // (shift amount)
// Nrot: register count handed to alloc, i.e. 4*(MEMLAT+2) rounded up to a
// multiple of 8.
46 #define Nrot ((4*(MEMLAT+2) + 7) & ~7)
89 #elif defined(USE_INT)
// ALIGN(n): loop-alignment helper. When GAS_ALIGN_BREAKS_UNWIND_INFO is
// defined it expands to a single "nop 0" bundle rather than an assembler
// alignment directive; the second definition maps it directly to ".align n"
// (presumably the alternative branch of this conditional -- the line that
// separates the two definitions is not shown here).
103 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
104 /* Manually force proper loop-alignment. Note: be sure to
105 double-check the code-layout after making any changes to
107 # define ALIGN(n) { nop 0 }
109 # define ALIGN(n) .align n
// LOOP(shift): copy loop used when the source is `shift` bits past an 8-byte
// boundary. Under the rotating stage predicates p[0], p[MEMLAT] and
// p[MEMLAT+1] it loads consecutive 8-byte words from asrc alternately into
// the rotating registers r[] and s[], merges each neighbouring pair with
// shrp by `shift` bits into tmp3 / tmp4, and stores the merged words to dest
// (two st8, i.e. 16 bytes, per br.ctop iteration, branching back to
// .loop<shift>). Once the loop has drained it branches to .copy_bytes to
// deal with the remaining bytes.
// The USE_LFETCH variant additionally issues lfetch.nt1 on ptr1 and ptr2;
// in the lines shown the second definition is the same minus the prefetches.
112 #if defined(USE_LFETCH)
113 #define LOOP(shift) \
117 (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
118 (p[0]) lfetch.nt1 [ptr1], 16 ; \
121 (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
122 (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
125 (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
126 (p[0]) lfetch.nt1 [ptr2], 16 ; \
129 (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
130 (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
131 br.ctop.sptk.many .loop##shift \
134 br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
137 #define LOOP(shift) \
141 (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
144 (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
145 (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
148 (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
151 (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
152 (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
153 br.ctop.sptk.many .loop##shift \
156 br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
// memcpy body.
//   in0 = dest, in2 = len; the original dest is returned in ret0.
// Strategy, as far as the code below shows:
//   - len == 0            -> .restore_and_exit
//   - len <= OP_T_THRES   -> .copy_bytes (byte loop L3)
//   - otherwise copy single bytes until dest is 8-byte aligned (loop L0);
//     then, if src is 8-byte aligned as well, run the unrolled loop L1
//     followed by the word loop L2 and the byte loop L3; if src is not
//     aligned, go to .src_not_aligned, which picks a shift-and-merge loop
//     through the offset table.
// The predicate registers and ar.lc are saved on entry and restored before
// returning.
164 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot-3 locals, 0 outputs, Nrot rotating
165 .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1] // names for rotating general registers
167 .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1] // names for rotating FP registers
168 mov ret0 = in0 // return value = dest
170 movi0 saved_pr = pr // save the predicate registers
172 and tmp4 = 7, in0 // check if destination is aligned
173 mov dest = in0 // dest
177 cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
178 .save ar.lc, saved_lc
179 movi0 saved_lc = ar.lc // save the loop counter
181 cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
184 (p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
185 (p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
// Copy single bytes until dest is 8-byte aligned (skipped when it already is).
188 #if defined(USE_LFETCH)
192 shr.u elemcnt = len, 3 // elemcnt = len / 8
194 cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
195 sub loopcnt = 7, tmp4 // loopcnt = 7 - (dest % 8); L0 runs loopcnt + 1 times
196 (p_scr) br.cond.dptk.many .dest_aligned
199 ld1 tmp2 = [src], 1 // preload the first byte
200 sub len = len, loopcnt, 1 // reduce len: len -= loopcnt + 1
201 movi0 ar.lc = loopcnt // set the loop counter
203 cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
206 .l0: // ---------------------------- // L0: Align dest on 8-byte boundary
208 st1 [dest] = tmp2, 1 // store the byte loaded previously
209 (p_scr) ld1 tmp2 = [src], 1 // fetch the next byte, if one is still needed
211 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
212 add loopcnt = -1, loopcnt
213 br.cloop.dptk.few .l0 //
// dest is 8-byte aligned from here on: choose between the aligned-source
// and the unaligned-source strategy.
218 and tmp4 = 7, src // ready for alignment check
219 shr.u elemcnt = len, 3 // elemcnt = len / 8
222 cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
223 tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
224 } { .mib // is not 16B aligned
225 add ptr2 = LFETCH_DIST, dest // prefetch address
226 add ptr1 = LFETCH_DIST, src // prefetch address
227 (p_scr) br.cond.dptk.many .src_not_aligned
230 // The optimal case, when dest, and src are aligned
234 .pred.rel "mutex",p_xtr,p_nxtr
235 (p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
236 (p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
237 movi0 pr.rot = 1 << 16 // set rotating predicates
239 (p_scr) br.cond.dpnt.many .copy_full_words
243 (p_xtr) load tempreg = [src], 8 // fetch the "extra" word (src not 16B aligned)
244 (p_xtr) add elemcnt = -1, elemcnt // that element is handled outside the loop
245 movi0 ar.ec = MEMLAT + 1 // set the epilog counter
248 (p_xtr) add len = -8, len // account for the "extra" word
249 add asrc = 16, src // one bank apart (for USE_INT)
250 shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
253 add loopcnt = -1, loopcnt // loopcnt--
254 (p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
259 movi0 ar.lc = loopcnt // set the loop counter
262 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
268 .l1: // ------------------------------- // L1: Everything a multiple of 8
270 #if defined(USE_LFETCH)
271 (p[0]) lfetch.nt1 [ptr2],32
273 (p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 // load a pair of 8-byte words
274 (p[0]) add len = -32, len // 32 bytes are consumed per iteration
276 (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
277 (p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
280 #if defined(USE_LFETCH)
281 (p[0]) lfetch.nt1 [ptr1],32
283 (p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 // load the second pair
285 (p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
286 (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
287 br.ctop.dptk.many .l1
289 #elif defined(USE_INT)
290 .l1: // ------------------------------- // L1: Everything a multiple of 8
292 (p[0]) load the_r[0] = [src], 8
293 (p[0]) load the_q[0] = [asrc], 8
294 (p[0]) add len = -32, len // 32 bytes are consumed per iteration
296 (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
297 (p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
300 (p[0]) load the_s[0] = [src], 24
301 (p[0]) load the_t[0] = [asrc], 24
303 (p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
304 (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
305 #if defined(USE_LFETCH)
308 (p[0]) lfetch.nt1 [ptr2],32
309 (p[0]) lfetch.nt1 [ptr1],32
311 br.ctop.dptk.many .l1
// Copy the remaining whole 8-byte words one at a time (loop L2); with fewer
// than 8 bytes left go straight to the byte loop.
317 cmp.gt p_scr, p0 = 8, len // fewer than 8 bytes left?
318 shr.u elemcnt = len, 3 // elemcnt = len / 8
319 (p_scr) br.cond.dpnt.many .copy_bytes
322 load tempreg = [src], 8
323 add loopcnt = -1, elemcnt // loopcnt = elemcnt - 1
326 cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
327 mov ar.lc = loopcnt // set the loop counter
330 .l2: // ------------------------------- // L2: Max 4 words copied separately
332 store [dest] = tempreg, 8
333 (p_scr) load tempreg = [src], 8 // fetch the next word, if one is still needed
336 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
337 add loopcnt = -1, loopcnt
338 br.cloop.dptk.few .l2
// Copy the trailing len bytes one at a time (loop L3); nothing to do when
// len == 0.
343 cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
344 add loopcnt = -1, len // loopcnt = len - 1
345 (p_scr) br.cond.spnt .restore_and_exit
349 movi0 ar.lc = loopcnt
350 cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
353 .l3: // ------------------------------- // L3: Final byte move
356 (p_scr) ld1 tmp2 = [src], 1
358 cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
359 add loopcnt = -1, loopcnt
360 br.cloop.dptk.few .l3
// Put back the predicate registers and loop counter saved on entry.
365 movi0 pr = saved_pr, -1 // restore the predicate registers
368 movi0 ar.lc = saved_lc // restore the loop counter
// Unaligned-source path: dest is 8-byte aligned but src is not. Aligned words
// are read starting at asrc = src & -8 and merged in one of the LOOP(shift)
// instances, selected through the offset table by src % 8. 16 bytes are
// handled per loop iteration; whatever is left (len % 16) is finished by
// .copy_bytes. With fewer than 16 bytes the byte loop is used directly.
375 cmp.gt p_scr, p0 = 16, len // fewer than 16 bytes left?
376 and sh1 = 7, src // sh1 = src % 8
377 shr.u loopcnt = len, 4 // element-cnt = len / 16
379 add tmp4 = @ltoff(.table), gp // address of the linkage-table slot for .table
380 add tmp3 = @ltoff(.loop56), gp // address of the linkage-table slot for .loop56
381 (p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
384 and asrc = -8, src // asrc = src & -8 -- align src for loop
385 add loopcnt = -1, loopcnt // loopcnt--
386 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
388 ld8 ptable = [tmp4] // ptable = &table
389 ld8 ploop56 = [tmp3] // ploop56 = &loop56
390 and tmp2 = -16, len // tmp2 = len & -16
393 add tmp3 = ptable, sh1 // tmp3 = &table + sh1
394 add src = src, tmp2 // src += len & (-16)
395 movi0 ar.lc = loopcnt // set LC
398 ld8 tmp4 = [tmp3] // tmp4 = loop offset
399 sub len = len, tmp2 // len -= len & (-16)
400 movi0 ar.ec = MEMLAT + 2 // one more pass needed
403 ld8 s[1] = [asrc], 8 // preload
404 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
405 movi0 pr.rot = 1 << 16 // set rotating predicates
410 br b6 // jump to the appropriate loop
// NOTE(review): libc_hidden_builtin_def is defined outside this file;
// presumably it provides the hidden internal alias for memcpy -- confirm.
421 libc_hidden_builtin_def (memcpy)
// Loop-offset table for the unaligned-source path. Each entry is 8 bytes and
// is addressed at byte offset 8 * (src % 8); it holds the distance from the
// selected loop back to .loop56. The copy code subtracts the loaded value
// from the address of .loop56 to obtain the address of the loop to run, so
// entry k selects .loop<8*k>.
426 data8 0 // dummy entry (src % 8 == 0 takes the aligned path instead)
427 data8 .loop56 - .loop8 // src % 8 == 1
428 data8 .loop56 - .loop16 // src % 8 == 2
429 data8 .loop56 - .loop24 // src % 8 == 3
430 data8 .loop56 - .loop32 // src % 8 == 4
431 data8 .loop56 - .loop40 // src % 8 == 5
432 data8 .loop56 - .loop48 // src % 8 == 6
433 data8 .loop56 - .loop56 // src % 8 == 7 (offset 0)