1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
28 The core of the function is the memcpy implementation used in memcpy.S.
29 When bytes have to be copied backwards, only the easy case, when
30 all arguments are multiples of 8, is optimised.
32 In this form, it assumes little endian mode. For big endian mode,
33 either sh1 must be computed using an extra instruction (sub sh1 = 64, sh1),
34 or the UM.be bit must be cleared at the beginning and set at the end. */
62 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1: load the next 8-byte word from asrc, then asrc += 8 */ \
63 (p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the merged word, then dest += 8 */ \
64 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* merge two consecutively loaded words, shifted right by `shift` bits */ \
67 br.ctop.sptk .loop##shift ; /* counted, pipelined loop branch back to this macro's own loop label */ \
68 br.cond.sptk .cpyfew ; /* after the loop, deal with the remaining bytes */
// Number of rotating registers: the r[MEMLAT + 2] and q[MEMLAT + 1] arrays
// declared with .rotr need 2*MEMLAT+3 registers in total; the expression
// rounds that up to the next multiple of 8.  Nrot is passed to alloc both
// as the rotating-register count and (minus the 3 inputs) as the local count.
71 #define Nrot (((2*MEMLAT+3) + 7) & ~7)
// memmove body.  Arguments arrive in in0 (dest), in1 (src) and in2 (len);
// dest is returned in ret0.
// NOTE(review): several lines of the original file (labels, the register
// alias definitions, some setup instructions, loop-closing branches and the
// return) are not visible in this excerpt.  Comments that depend on them
// are marked NOTE(review) and should be confirmed against the full file.
//
// Allocate the register frame: 3 inputs, Nrot - 3 locals, 0 outputs,
// Nrot rotating registers.
75 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
// Name the two rotating register arrays used by the pipelined loops.
76 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
78 mov ret0 = in0 // return value = dest
80 mov saved_pr = pr // save the predicate registers
82 mov saved_lc = ar.lc // save the loop counter
84 or tmp3 = in0, in1 ;; // tmp3 = dest | src
85 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
86 mov dest = in0 // dest
// NOTE(review): src and len are used below but their initialisation is not
// visible here; presumably src = in1 and len = in2 -- confirm.
89 sub tmp2 = r0, in0 // tmp2 = -dest
90 cmp.eq p6, p0 = in2, r0 // if (len == 0)
91 (p6) br.cond.spnt .restore_and_exit;;// return dest;
// tmp4 is zero only when dest, src and len are all multiples of 8; it
// selects the fast word-copy paths in both directions.
92 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
93 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
94 (p6) br.cond.spnt .forward // to copy forward
// NOTE(review): the test below matches its comment only if tmp3 holds
// src + len at this point; the instruction computing that is not visible
// in this excerpt -- confirm.
96 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
97 (p6) br.cond.spnt .backward // we have to copy backward
// ---- Forward copy (NOTE(review): presumably the .forward label is here) ----
100 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
101 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
102 (p6) br.cond.sptk .next // goto next;
104 // The optimal case, when dest, src and len are all multiples of 8
107 mov pr.rot = 1 << 16 // set rotating predicates
108 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
// NOTE(review): tmp3 is tested here to detect a leftover 8-byte word beyond
// the 16-byte iterations; the instruction that recomputes tmp3 (presumably
// len & 0xf) is not visible in this excerpt -- confirm.
109 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
110 adds loopcnt = -1, loopcnt;; // --loopcnt
111 (p6) ld8 value = [src], 8;;
112 (p6) st8 [dest] = value, 8 // copy the "odd" word
113 mov ar.lc = loopcnt // set the loop counter
114 cmp.eq p6, p0 = 8, len
115 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
116 adds adest = 8, dest // set adest one word ahead of dest
117 adds asrc = 8, src ;; // set asrc one word ahead of src
118 nop.b 0 // get the "golden" alignment for
119 nop.b 0 // the next loop
// Pipelined loop body copying 16 bytes per iteration as two interleaved
// 8-byte streams: r[] through src/dest and q[] through asrc/adest, each
// pointer advancing by 16.  Loads issue in stage 0, stores in stage MEMLAT.
// NOTE(review): the loop label and the loop-closing branch are not visible.
121 (p[0]) ld8 r[0] = [src], 16
122 (p[0]) ld8 q[0] = [asrc], 16
123 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
124 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
// Restore the caller's state.  NOTE(review): the return is not visible here.
127 mov pr = saved_pr, -1 // restore the predicate registers
128 mov ar.lc = saved_lc // restore the loop counter
// ---- General forward path (NOTE(review): presumably the .next label) ----
// Short copies go straight to the byte loop; longer ones first align dest.
131 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
132 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
133 (p6) br.cond.spnt .cpyfew // copy byte by byte
// loopcnt == 0 means dest is already 8-byte aligned.
135 cmp.eq p6, p0 = loopcnt, r0
136 (p6) br.cond.sptk .dest_aligned
137 sub len = len, loopcnt // len -= -dest % 8
138 adds loopcnt = -1, loopcnt // --loopcnt
// NOTE(review): loading ar.lc and the branch that closes .l1 are not visible.
141 .l1: // copy -dest % 8 bytes
142 ld1 value = [src], 1 // value = *src++
144 st1 [dest] = value, 1 // *dest++ = value
// ---- dest is now 8-byte aligned (NOTE(review): presumably .dest_aligned) ----
// Split len into whole words (loop count) and a 0..7 byte tail, and find
// out how far src is from 8-byte alignment.
147 and sh1 = 7, src // sh1 = src % 8
148 and tmp2 = -8, len // tmp2 = len & -OPSIZ
149 and asrc = -8, src // asrc = src & -OPSIZ -- align src
150 shr.u loopcnt = len, 3 // loopcnt = len / 8
151 and len = 7, len;; // len = len % 8
152 adds loopcnt = -1, loopcnt // --loopcnt
// Form the gp-relative addresses of the linkage-table slots for .table and
// .loop56; the actual addresses are loaded from those slots further down.
153 addl tmp4 = @ltoff(.table), gp
154 addl tmp3 = @ltoff(.loop56), gp
155 mov ar.ec = MEMLAT + 1 // set EC
156 mov pr.rot = 1 << 16;; // set rotating predicates
157 mov ar.lc = loopcnt // set LC
158 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
159 (p6) br.cond.sptk .src_aligned
// src is misaligned: words are read through the aligned pointer asrc and
// merged, while src itself is advanced past the whole-word part so that it
// points at the tail bytes.
160 add src = src, tmp2 // src += len & -OPSIZ
// sh1 becomes 8 * (src % 8); below it is added to the table address to
// select an 8-byte table entry.
161 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
162 ld8 ploop56 = [tmp3] // ploop56 = &loop56
163 ld8 ptable = [tmp4];; // ptable = &table
164 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
165 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
166 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
167 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
// Preload the first aligned source word so the first merge has a pair.
168 ld8 r[1] = [asrc], 8;; // w0
// NOTE(review): b6 is expected to hold loopaddr here; the move into b6 is
// not visible in this excerpt -- confirm.
170 br b6 // jump to the appropriate loop
// ---- src and dest both aligned (NOTE(review): presumably .src_aligned) ----
// Pipelined copy of one 8-byte word per iteration.
// NOTE(review): the loop label and loop-closing branch are not visible.
182 (p[0]) ld8 r[0] = [src], 8
183 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
// ---- Tail / short copy (NOTE(review): presumably the .cpyfew label) ----
// Copies the remaining len bytes one at a time; exits at once if none.
186 cmp.eq p6, p0 = len, r0 // is len == 0 ?
187 adds len = -1, len // --len;
188 (p6) br.cond.spnt .restore_and_exit ;;
// NOTE(review): only the store of the byte loop is visible; the loop-counter
// setup, the matching byte load and the loop branch are not.
193 st1 [dest] = value, 1
// Common exit: restore the caller's state.
// NOTE(review): presumably the .restore_and_exit label; return not visible.
196 mov pr = saved_pr, -1 // restore the predicate registers
197 mov ar.lc = saved_lc // restore the loop counter
200 // In the case of a backward copy, optimise only the case when everything
201 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
202 // used only when the blocks are overlapping and dest > src.
// NOTE(review): presumably the .backward label is here.
205 shr.u loopcnt = len, 3 // loopcnt = len / 8
206 add src = src, len // src points one byte past the end
207 add dest = dest, len ;; // dest points one byte past the end
// The epilog counter and rotating predicates are set before the branch, so
// they are in effect for both the word loop and the .bytecopy loop.
208 mov ar.ec = MEMLAT + 1 // set the epilog counter
209 mov pr.rot = 1 << 16 // set rotating predicates
210 adds loopcnt = -1, loopcnt // --loopcnt
// tmp4 still holds (dest | src | len) & 7 from the entry sequence.
211 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
212 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
213 adds src = -8, src // src points to the last word
214 adds dest = -8, dest // dest points to the last word
215 mov ar.lc = loopcnt;; // set the loop counter
// Pipelined backward word copy, both pointers stepping by -8.
// NOTE(review): the loop label and loop-closing branch are not visible.
217 (p[0]) ld8 r[0] = [src], -8
218 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
220 br.cond.sptk .restore_and_exit
// ---- Backward byte copy (NOTE(review): presumably the .bytecopy label) ----
// src and dest arrive pointing one byte past the end of their blocks.
222 adds src = -1, src // src points to the last byte
223 adds dest = -1, dest // dest points to the last byte
224 adds loopcnt = -1, len;; // loopcnt = len - 1
225 mov ar.lc = loopcnt;; // set the loop counter
// Pipelined backward byte copy, both pointers stepping by -1.
// NOTE(review): the loop label and loop-closing branch are not visible.
227 (p[0]) ld1 r[0] = [src], -1
228 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
230 br.cond.sptk .restore_and_exit
// Jump-offset table for the misaligned-source loops: one 8-byte entry per
// value of src % 8, each holding the distance from that loop's label to
// .loop56.  The dispatch code adds 8 * (src % 8) to the table address,
// loads the entry and subtracts it from the address of .loop56 to obtain
// the loop to jump to.  Entry 0 is never read, because an aligned source
// branches to .src_aligned before the table lookup.
// NOTE(review): the .table label is expected immediately before the first
// entry; it is not visible in this excerpt.
232 data8 0 // dummy entry
233 data8 .loop56 - .loop8
234 data8 .loop56 - .loop16
235 data8 .loop56 - .loop24
236 data8 .loop56 - .loop32
237 data8 .loop56 - .loop40
238 data8 .loop56 - .loop48
239 data8 .loop56 - .loop56
// Applies the libc_hidden_builtin_def macro to memmove.
// NOTE(review): the macro is defined outside this excerpt -- see its
// definition for what it emits.
242 libc_hidden_builtin_def (memmove)