1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
28 The core of the function is the memcpy implementation used in memcpy.S.
29 When bytes have to be copied backwards, only the easy case, when
30 all arguments are multiples of 8, is optimised.
32 In this form, it assumes little endian mode. For big endian mode,
33 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
34 or the UM.be bit should be cleared at the beginning and set at the end. */
/* ALIGN(n): alignment helper.  When GAS_ALIGN_BREAKS_UNWIND_INFO is defined
   it expands to a { nop 0 } bundle; the other definition expands to a plain
   .align n directive.  Judging by the macro name, the first form works
   around assemblers whose .align damages unwind information -- TODO confirm.  */
59 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
60 # define ALIGN(n) { nop 0 }
62 # define ALIGN(n) .align n
68 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1: stage 0 loads the next 8-byte word through the aligned pointer asrc */ \
69 (p[MEMLAT+1]) st8 [dest] = value, 8 ; /* stage MEMLAT+1 stores the word assembled by shrp, dest += 8 */ \
70 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* stage MEMLAT: build one output word from two consecutive source words */ \
73 br.ctop.sptk .loop##shift ; /* counted loop branch back to the top of this loop */ \
74 br.cond.sptk .cpyfew ; /* loop finished: deal with the remaining bytes */
/* Nrot: size of the rotating register region requested by alloc.  The r[]
   and q[] rotating arrays need (MEMLAT + 2) + (MEMLAT + 1) = 2*MEMLAT + 3
   registers in total; the expression rounds that count up to the next
   multiple of 8.  */
77 #define Nrot (((2*MEMLAT+3) + 7) & ~7)
// Entry sequence and direction dispatch.
// in0 = dest, in1 = src, in2 = len; ret0 is set to dest before anything else.
// Control leaves this sequence as follows:
//   len == 0                  -> .restore_and_exit
//   dest <= src               -> .forward
//   src < dest < src + len    -> .backward
//   otherwise                 -> continues with the code that follows.
// Along the way tmp2 = -dest and tmp4 = (dest | src | len) & 7 are prepared
// for the alignment tests made later.
81 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating
82 .rotr r[MEMLAT + 2], q[MEMLAT + 1] // rotating register arrays r[] and q[]
84 mov ret0 = in0 // return value = dest
86 mov saved_pr = pr // save the predicate registers
88 mov saved_lc = ar.lc // save the loop counter
90 or tmp3 = in0, in1 ;; // tmp3 = dest | src
91 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
92 mov dest = in0 // working copy of dest
95 sub tmp2 = r0, in0 // tmp2 = -dest
96 cmp.eq p6, p0 = in2, r0 // if (len == 0)
97 (p6) br.cond.spnt .restore_and_exit;;// return dest;
98 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
// NOTE(review): cmp.le and cmp.lt below are the signed compare forms applied
// to addresses -- confirm that signed ordering is what is intended here.
99 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
100 (p6) br.cond.spnt .forward // to copy forward
101 add tmp3 = src, len;; // tmp3 = src + len (one past the source block)
102 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
103 (p6) br.cond.spnt .backward // we have to copy backward
// Forward copy, fast path.  Used only when dest, src and len are all
// multiples of 8 (tmp4 == 0); anything else branches to .next.
// The main loop moves 16 bytes per iteration as two interleaved 8-byte
// streams (src -> dest and asrc -> adest, each stepping by 16), with every
// store predicated MEMLAT stages after its matching load.
106 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
107 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
108 (p6) br.cond.sptk .next // goto next;
110 // The optimal case, when dest, src and len are all multiples of 8
113 mov pr.rot = 1 << 16 // set rotating predicates: p16 = 1, the rest cleared
114 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
// NOTE(review): this test only makes sense if tmp3 holds len % 16 here; the
// instruction that establishes that is not visible at this point -- confirm.
115 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
116 adds loopcnt = -1, loopcnt;; // --loopcnt
117 (p6) ld8 value = [src], 8;; // value = *src, src += 8 (only when p6 is set)
118 (p6) st8 [dest] = value, 8 // copy the "odd" word
119 mov ar.lc = loopcnt // set the loop counter
120 cmp.eq p6, p0 = 8, len // len == 8: the single word was handled above
121 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
122 adds adest = 8, dest // set adest one word ahead of dest
123 adds asrc = 8, src ;; // set asrc one word ahead of src
124 nop.b 0 // get the "golden" alignment for
125 nop.b 0 // the next loop
127 (p[0]) ld8 r[0] = [src], 16 // first stream: load a word, src += 16
128 (p[0]) ld8 q[0] = [asrc], 16 // second stream: load a word, asrc += 16
129 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16 // first stream: store MEMLAT stages later
130 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16 // second stream: store MEMLAT stages later
133 mov pr = saved_pr, -1 // restore the predicate registers (mask -1)
134 mov ar.lc = saved_lc // restore the loop counter
// Forward copy, general case.
// Lengths up to OP_T_THRES go straight to the byte copier at .cpyfew.
// Otherwise, if dest is not 8-byte aligned, the leading (-dest % 8) bytes are
// copied one at a time and subtracted from len; an already aligned dest
// branches directly to .dest_aligned.
137 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
138 and loopcnt = 7, tmp2 // loopcnt = -dest % 8 = bytes needed to align dest
139 (p6) br.cond.spnt .cpyfew // copy byte by byte
141 cmp.eq p6, p0 = loopcnt, r0 // is dest already 8-byte aligned?
142 (p6) br.cond.sptk .dest_aligned
143 sub len = len, loopcnt // len -= -dest % 8
144 adds loopcnt = -1, loopcnt // --loopcnt
147 .l1: // copy -dest % 8 bytes
148 ld1 value = [src], 1 // value = *src++
150 st1 [dest] = value, 1 // *dest++ = value
// Word-copy setup, reached once dest is 8-byte aligned.
// len is split into a whole-word count (len / 8, minus one, placed in ar.lc)
// and a tail of len % 8 bytes.  If src is also 8-byte aligned, control goes
// to .src_aligned.  Otherwise src % 8 selects one of the shifted copy loops:
// the matching .table entry holds (.loop56 - .loopN), which is subtracted
// from the address of .loop56 to obtain the address of .loopN.
153 and sh1 = 7, src // sh1 = src % 8
154 and tmp2 = -8, len // tmp2 = len & -OPSIZ
155 and asrc = -8, src // asrc = src & -OPSIZ -- align src
156 shr.u loopcnt = len, 3 // loopcnt = len / 8
157 and len = 7, len;; // len = len % 8
158 adds loopcnt = -1, loopcnt // --loopcnt
159 addl tmp4 = @ltoff(.table), gp // tmp4 = address of the linkage-table slot for .table
160 addl tmp3 = @ltoff(.loop56), gp // tmp3 = address of the linkage-table slot for .loop56
161 mov ar.ec = MEMLAT + 1 // set EC
162 mov pr.rot = 1 << 16;; // set rotating predicates
163 mov ar.lc = loopcnt // set LC
164 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
165 (p6) br.cond.sptk .src_aligned
166 add src = src, tmp2 // src += len & -OPSIZ
// After the shift, sh1 is the byte offset of the .table entry (entries are
// 8 bytes each) and also equals the bit count 8 * (src % 8).
167 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
168 ld8 ploop56 = [tmp3] // ploop56 = &loop56
169 ld8 ptable = [tmp4];; // ptable = &table
170 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
// The shifted loops store at stage MEMLAT + 1, one stage later than the
// aligned loop, so the epilog count grows by one.
171 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
172 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
173 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
174 ld8 r[1] = [asrc], 8;; // w0: preload the first aligned source word
// NOTE(review): b6 is expected to hold loopaddr at this point; the
// instruction that writes b6 is not visible here -- confirm.
176 br b6 // jump to the appropriate loop
// Pipelined 8-byte copy: load at stage 0, store MEMLAT stages later, both
// pointers stepping forward by 8.  Presumably this is the body reached via
// .src_aligned, where src and dest are both 8-byte aligned -- confirm.
188 (p[0]) ld8 r[0] = [src], 8
189 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
// Tail handling: leave immediately when no bytes remain, otherwise the
// remaining bytes are written one at a time.
192 cmp.eq p6, p0 = len, r0 // is len == 0 ?
193 adds len = -1, len // --len;
194 (p6) br.cond.spnt .restore_and_exit ;;
199 st1 [dest] = value, 1 // *dest++ = value
// Put back the caller's predicate registers and loop counter.
202 mov pr = saved_pr, -1 // restore the predicate registers
203 mov ar.lc = saved_lc // restore the loop counter
206 // In the case of a backward copy, optimise only the case when everything
207 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
208 // used only when the blocks are overlapping and dest > src.
// Both pointers are first moved to the end of their blocks and then walk
// downwards, so every source byte is read before the copy overwrites it.
// tmp4 still holds (dest | src | len) & 7 from the entry sequence.
211 shr.u loopcnt = len, 3 // loopcnt = len / 8
212 add src = src, len // src points one byte past the end
213 add dest = dest, len ;; // dest points one byte past the end
214 mov ar.ec = MEMLAT + 1 // set the epilog counter
215 mov pr.rot = 1 << 16 // set rotating predicates
216 adds loopcnt = -1, loopcnt // --loopcnt
217 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
218 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
219 adds src = -8, src // src points to the last word
220 adds dest = -8, dest // dest points to the last word
221 mov ar.lc = loopcnt;; // set the loop counter
// Word-at-a-time backward copy: both pointers step by -8.
223 (p[0]) ld8 r[0] = [src], -8
224 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
226 br.cond.sptk .restore_and_exit
// Byte-at-a-time backward copy (presumably the .bytecopy target -- confirm):
// both pointers step by -1 and the loop counter is len - 1.
228 adds src = -1, src // src points to the last byte
229 adds dest = -1, dest // dest points to the last byte
230 adds loopcnt = -1, len;; // loopcnt = len - 1
231 mov ar.lc = loopcnt;; // set the loop counter
233 (p[0]) ld1 r[0] = [src], -1
234 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
236 br.cond.sptk .restore_and_exit
// Dispatch table for the shifted copy loops: 8-byte entries indexed by
// src % 8.  Entry k holds the distance from .loop(8*k) to .loop56; the
// dispatch code subtracts the entry from the address of .loop56 to get the
// address of the loop to run.  Index 0 is never looked up, because an
// aligned src branches to .src_aligned before the table is consulted.
242 data8 0 // dummy entry
243 data8 .loop56 - .loop8 // src % 8 == 1
244 data8 .loop56 - .loop16 // src % 8 == 2
245 data8 .loop56 - .loop24 // src % 8 == 3
246 data8 .loop56 - .loop32 // src % 8 == 4
247 data8 .loop56 - .loop40 // src % 8 == 5
248 data8 .loop56 - .loop48 // src % 8 == 6
249 data8 .loop56 - .loop56 // src % 8 == 7 (offset 0)
251 libc_hidden_builtin_def (memmove)