/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>. */
/* An assembly implementation of the algorithm used by the generic C
   version from glibc. The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode. For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
   shrp instruction. */
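/* For illustration only: a minimal C sketch (hypothetical names, not
   part of the build) of the shift-and-merge technique the unaligned
   path below implements with shrp. Little endian is assumed, matching
   the note above, and sh1 != 0 because this path is only taken when
   src is unaligned.

   #include <stdint.h>
   #include <stddef.h>

   // dest is 8-byte aligned, src is not; n is a multiple of 8.
   // Like the assembly, the last aligned load may read up to 7 bytes
   // past src + n, but never beyond the enclosing 8-byte word.
   static void shift_merge_copy (uint64_t *dest, const char *src, size_t n)
   {
     unsigned sh1 = 8 * ((uintptr_t) src & 7);       // bit shift, as below
     const uint64_t *asrc = (const uint64_t *) ((uintptr_t) src & -8);
     uint64_t lo = *asrc++;                          // preload (cf. ld8 s[1])
     for (size_t i = 0; i < n / 8; i++)
       {
         uint64_t hi = *asrc++;
         dest[i] = (lo >> sh1) | (hi << (64 - sh1)); // the shrp step
         lo = hi;
       }
   }
*/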
#define LFETCH_DIST 500

#define ALIGN_UNROLL_no 4 // no. of elements
#define ALIGN_UNROLL_sh 2 // (shift amount)

#define Nrot ((4*(MEMLAT+2) + 7) & ~7)

#elif defined(USE_INT)
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment. Note: be sure to
   double-check the code-layout after making any changes to
   this code. */
# define ALIGN(n) { nop 0 }
#else
# define ALIGN(n) .align n
#endif
#if defined(USE_LFETCH)
#define LOOP(shift) \
ALIGN(32) ; \
.loop##shift##: \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
(p[0]) lfetch.nt1 [ptr1], 16 ; \
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
(p[0]) lfetch.nt1 [ptr2], 16 ; \
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
br.ctop.sptk.many .loop##shift ; \
br.cond.sptk.many .copy_bytes /* deal with the remaining bytes */
#else
#define LOOP(shift) \
ALIGN(32) ; \
.loop##shift##: \
(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
br.ctop.sptk.many .loop##shift ; \
br.cond.sptk.many .copy_bytes /* deal with the remaining bytes */
#endif
alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
.rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
.rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
mov ret0 = in0 // return value = dest
.save pr, saved_pr
movi0 saved_pr = pr // save the predicate registers
and tmp4 = 7, in0 // check if destination is aligned
mov dest = in0 // dest
mov src = in1 // src
mov len = in2 // len
cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
.save ar.lc, saved_lc
movi0 saved_lc = ar.lc // save the loop counter
cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES?
(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
#if defined(USE_LFETCH)
shr.u elemcnt = len, 3 // elemcnt = len / 8
cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
sub loopcnt = 7, tmp4 //
(p_scr) br.cond.dptk.many .dest_aligned
ld1 tmp2 = [src], 1 //
sub len = len, loopcnt, 1 // len -= loopcnt + 1
movi0 ar.lc = loopcnt //
cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
.l0: // ---------------------------- // L0: Align dest on 8-byte boundary
st1 [dest] = tmp2, 1 //
(p_scr) ld1 tmp2 = [src], 1 //
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l0 //
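/* What .l0 above does, as a hypothetical C sketch (not part of the
   build): copy 8 - (dest % 8) bytes so dest becomes 8-byte aligned,
   pairing each store with a predicated load of the next byte so the
   loop never reads past the data it will actually store.

   #include <stdint.h>
   #include <stddef.h>

   static unsigned char *
   align_dest (unsigned char *dest, const unsigned char *src, size_t *len)
   {
     size_t loopcnt = 7 - ((uintptr_t) dest & 7); // iterations - 1
     unsigned char tmp2 = *src++;                 // preload (ld1 above)
     *len -= loopcnt + 1;
     for (;;)
       {
         *dest++ = tmp2;                          // st1
         if (loopcnt == 0)                        // br.cloop exhausted
           break;
         tmp2 = *src++;                           // (p_scr) guarded ld1
         loopcnt--;
       }
     return dest;
   }
*/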
.dest_aligned:
and tmp4 = 7, src // ready for alignment check
shr.u elemcnt = len, 3 // elemcnt = len / 8
cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned?
tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
} { .mib // is not 16B aligned
add ptr2 = LFETCH_DIST, dest // prefetch address
add ptr1 = LFETCH_DIST, src
(p_scr) br.cond.dptk.many .src_not_aligned

// The optimal case, when dest and src are aligned
.pred.rel "mutex",p_xtr,p_nxtr
(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
movi0 pr.rot = 1 << 16 // set rotating predicates
(p_scr) br.cond.dpnt.many .copy_full_words
(p_xtr) load tempreg = [src], 8
(p_xtr) add elemcnt = -1, elemcnt
movi0 ar.ec = MEMLAT + 1 // set the epilog counter
(p_xtr) add len = -8, len //
add asrc = 16, src // one bank apart (for USE_INT)
shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
add loopcnt = -1, loopcnt
(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
movi0 ar.lc = loopcnt // set the loop counter
#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
{ nop 0 }
#else
.align 32
#endif
#if defined(USE_FLP)
.l1: // ------------------------------- // L1: Everything a multiple of 8
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr2],32
#endif
(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16
(p[0]) add len = -32, len
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr1],32
#endif
(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
br.ctop.dptk.many .l1
#elif defined(USE_INT)
.l1: // ------------------------------- // L1: Everything a multiple of 8
(p[0]) load the_r[0] = [src], 8
(p[0]) load the_q[0] = [asrc], 8
(p[0]) add len = -32, len
(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
(p[0]) load the_s[0] = [src], 24
(p[0]) load the_t[0] = [asrc], 24
(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
#if defined(USE_LFETCH)
(p[0]) lfetch.nt1 [ptr2],32
(p[0]) lfetch.nt1 [ptr1],32
#endif
br.ctop.dptk.many .l1
#endif
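/* The .l1 kernels above are software-pipelined through the rotating
   register and predicate files: br.ctop rotates them each iteration,
   so stage 0 (p[0]) issues loads while stage MEMLAT (p[MEMLAT])
   retires the stores for data loaded MEMLAT iterations earlier,
   hiding load latency. A hypothetical C model (MODEL_MEMLAT stands in
   for this file's MEMLAT, which is defined above this excerpt):

   #include <stdint.h>
   #include <stddef.h>

   #define MODEL_MEMLAT 8
   static void pipelined_copy (uint64_t *dst, const uint64_t *src, size_t n)
   {
     uint64_t r[MODEL_MEMLAT + 1];             // models the .rotr file
     for (size_t i = 0; i < n + MODEL_MEMLAT; i++)
       {
         if (i < n)                            // p[0]: prolog + kernel
           r[i % (MODEL_MEMLAT + 1)] = src[i]; // issue a load
         if (i >= MODEL_MEMLAT)                // p[MEMLAT]: kernel + epilog
           dst[i - MODEL_MEMLAT] =             // retire the matching store
             r[(i - MODEL_MEMLAT) % (MODEL_MEMLAT + 1)];
       }
   }

   The real kernels additionally move ALIGN_UNROLL_no = 4 words (32
   bytes) per rotation, split across two streams 16 bytes apart
   ([src]/[asrc] and [dest]/[adest]) to spread accesses over both
   cache banks. */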
.copy_full_words:
cmp.gt p_scr, p0 = 8, len //
shr.u elemcnt = len, 3 //
(p_scr) br.cond.dpnt.many .copy_bytes
load tempreg = [src], 8
add loopcnt = -1, elemcnt //
cmp.ne p_scr, p0 = 0, loopcnt //
mov ar.lc = loopcnt //
.l2: // ------------------------------- // L2: Max 4 words copied separately
store [dest] = tempreg, 8
(p_scr) load tempreg = [src], 8 //
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l2
.copy_bytes:
cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
add loopcnt = -1, len // loopcnt = len - 1
(p_scr) br.cond.spnt .restore_and_exit
ld1 tmp2 = [src], 1 // preload the first byte
movi0 ar.lc = loopcnt
cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
.l3: // ------------------------------- // L3: Final byte move
st1 [dest] = tmp2, 1
(p_scr) ld1 tmp2 = [src], 1
cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
add loopcnt = -1, loopcnt
br.cloop.dptk.few .l3
.restore_and_exit:
movi0 pr = saved_pr, -1 // restore the predicate registers
movi0 ar.lc = saved_lc // restore the loop counter
br.ret.sptk.many b0
.src_not_aligned:
cmp.gt p_scr, p0 = 16, len
and sh1 = 7, src // sh1 = src % 8
shr.u loopcnt = len, 4 // element-cnt = len / 16
add tmp4 = @ltoff(.table), gp
add tmp3 = @ltoff(.loop56), gp
(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
and asrc = -8, src // asrc = src & -8 -- align src for loop
add loopcnt = -1, loopcnt // loopcnt--
shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
ld8 ptable = [tmp4] // ptable = &table
ld8 ploop56 = [tmp3] // ploop56 = &loop56
and tmp2 = -16, len // tmp2 = len & -16
add tmp3 = ptable, sh1 // tmp3 = &table + sh1
add src = src, tmp2 // src += len & (-16)
movi0 ar.lc = loopcnt // set LC
ld8 tmp4 = [tmp3] // tmp4 = loop offset
sub len = len, tmp2 // len -= len & (-16)
movi0 ar.ec = MEMLAT + 2 // one more pass needed
ld8 s[1] = [asrc], 8 // preload
sub loopaddr = ploop56, tmp4 // loopaddr = &loop56 - loop offset
movi0 pr.rot = 1 << 16 // set rotating predicates
mov b6 = loopaddr
br b6 // jump to the appropriate loop

LOOP(8)
LOOP(16)
LOOP(24)
LOOP(32)
LOOP(40)
LOOP(48)
LOOP(56)

END(memcpy)
libc_hidden_builtin_def (memcpy)
.table:
data8 0 // dummy entry
data8 .loop56 - .loop8
data8 .loop56 - .loop16
data8 .loop56 - .loop24
data8 .loop56 - .loop32
data8 .loop56 - .loop40
data8 .loop56 - .loop48
data8 .loop56 - .loop56
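/* The table stores ".loop56 - .loopN" distances instead of absolute
   addresses, so "sub loopaddr = ploop56, tmp4" above reconstructs the
   loop address position-independently. A hypothetical C rendering of
   the same idiom, using GCC's computed-goto extension (demo only, not
   part of glibc):

   #include <stdio.h>

   static void demo_dispatch (unsigned sel) // sel plays the role of sh1 / 8
   {
     static const int table[8] = {
       0,                                  // dummy entry
       &&l56 - &&l8,  &&l56 - &&l16, &&l56 - &&l24, &&l56 - &&l32,
       &&l56 - &&l40, &&l56 - &&l48, &&l56 - &&l56,
     };
     goto *(&&l56 - table[sel]);           // cf. sub loopaddr = ploop56, tmp4
   l8:  puts ("shift 8");  return;
   l16: puts ("shift 16"); return;
   l24: puts ("shift 24"); return;
   l32: puts ("shift 32"); return;
   l40: puts ("shift 40"); return;
   l48: puts ("shift 48"); return;
   l56: puts ("shift 56"); return;
   }
*/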