1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
28 The core of the function is the memcpy implementation used in memcpy.S.
29 When bytes have to be copied backwards, only the easy case, when
30 all arguments are multiples of 8, is optimised.
32 In this form, it assumes little endian mode. For big endian mode,
33 either sh1 must be computed using an extra instruction (sub sh1 = 64, sh1),
34 or the UM.be bit should be cleared at the beginning and set at the end. */
63 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
64 (p[MEMLAT+1]) st8 [dest] = value, 8 ; \
65 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
68 br.ctop.sptk .loop##shift ; \
69 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
// Entry sequence (fragment): allocate the register frame, save the state
// that is restored before returning, and pick the copy direction.
// NOTE(review): the embedded numbering has gaps (72->74->76, 81->84, 89->91),
// so the entry label, the setup of src and len, and the instruction expected
// to form src + len in tmp3 are not visible in this excerpt -- confirm
// against the complete file.
// Register frame: 3 inputs, 29 locals, 0 outputs, 32 rotating registers.
72 alloc saved_pfs = ar.pfs, 3, 29, 0, 32
// r[] and q[] name the rotating registers used by the copy loops.
74 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
76 mov ret0 = in0 // return value = dest
77 mov saved_pr = pr // save the predicate registers
78 mov saved_lc = ar.lc // save the loop counter
79 or tmp3 = in0, in1 ;; // tmp3 = dest | src
80 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
81 mov dest = in0 // dest
// tmp2 is consumed later to find how many bytes bring dest to 8-byte alignment.
84 sub tmp2 = r0, in0 // tmp2 = -dest
85 cmp.eq p6, p0 = in2, r0 // if (len == 0)
86 (p6) br.cond.spnt .restore_and_exit;;// return dest;
// tmp4 != 0 means at least one of dest, src, len is not a multiple of 8.
87 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
// NOTE(review): cmp.le / cmp.lt are signed comparisons; confirm that this is
// the intended ordering for pointer values.
88 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
89 (p6) br.cond.spnt .forward // to copy forward
// NOTE(review): per the trailing comment tmp3 should be src + len here, but
// the last visible write to tmp3 is dest | src | len (numbering 89->91).
91 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
92 (p6) br.cond.spnt .backward // we have to copy backward
// Forward copy (fragment).  When dest, src and len are all multiples of 8,
// each loop iteration moves two 8-byte words (16 bytes); otherwise control
// transfers to .next.
// NOTE(review): the embedded numbering has gaps (92->95, 99->102, 114->116,
// 119->122), so the labels, one setup instruction and the loop-closing
// branch are not visible in this excerpt.
95 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
96 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
97 (p6) br.cond.sptk .next // goto next;
99 // The optimal case, when dest, src and len are all multiples of 8
// 1 << 16 sets p16 and clears the remaining rotating predicates.
102 mov pr.rot = 1 << 16 // set rotating predicates
103 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
// NOTE(review): tmp3 is presumably the remainder of len / 16 at this point;
// the instruction that sets it is not visible (numbering 99->102).
104 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
105 adds loopcnt = -1, loopcnt;; // --loopcnt
106 (p6) ld8 value = [src], 8;;
107 (p6) st8 [dest] = value, 8 // copy the "odd" word
108 mov ar.lc = loopcnt // set the loop counter
// With len == 8 the single word was already copied by the two
// predicated instructions above, so there is nothing left to do.
109 cmp.eq p6, p0 = 8, len
110 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
// Two pointer pairs, 8 bytes apart, each stepping by 16 in the loop below.
111 adds adest = 8, dest // set adest one word ahead of dest
112 adds asrc = 8, src ;; // set asrc one word ahead of src
113 nop.b 0 // get the "golden" alignment for
114 nop.b 0 // the next loop
// Loop body: loads are issued in stage 0, the matching stores in stage MEMLAT.
116 (p[0]) ld8 r[0] = [src], 16
117 (p[0]) ld8 q[0] = [asrc], 16
118 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
119 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
// Restore the state saved on entry.
// NOTE(review): the return branch is not visible in this excerpt.
122 mov ar.pfs = saved_pfs // restore the PFS
123 mov pr = saved_pr, -1 // restore the predicate registers
124 mov ar.lc = saved_lc // restore the loop counter
// Unaligned forward copy (fragment): copies of at most OP_T_THRES bytes go
// straight to .cpyfew; otherwise bytes are copied one at a time until dest
// is 8-byte aligned.
// NOTE(review): the embedded numbering has gaps (124->127, 134->137,
// 138->140->143), so the block label, the ar.lc setup and the branch that
// closes .l1 are not visible in this excerpt.
127 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
128 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
129 (p6) br.cond.spnt .cpyfew // copy byte by byte
// loopcnt == 0 means dest is already 8-byte aligned.
131 cmp.eq p6, p0 = loopcnt, r0
132 (p6) br.cond.sptk .dest_aligned
133 sub len = len, loopcnt // len -= -dest % 8
134 adds loopcnt = -1, loopcnt // --loopcnt
137 .l1: // copy -dest % 8 bytes
138 ld1 value = [src], 1 // value = *src++
140 st1 [dest] = value, 1 // *dest++ = value
// dest is now 8-byte aligned (fragment).  Split len into whole words and a
// byte remainder, then either branch to .src_aligned or compute the address
// of the shift loop matching the misalignment of src and jump to it.
// NOTE(review): the embedded numbering has gaps (140->143, 164->166), so the
// block label and the instruction that loads b6 (presumably from loopaddr)
// are not visible in this excerpt.
143 and sh1 = 7, src // sh1 = src % 8
144 and tmp2 = -8, len // tmp2 = len & -OPSIZ
145 and asrc = -8, src // asrc = src & -OPSIZ -- align src
146 shr.u loopcnt = len, 3 // loopcnt = len / 8
147 and len = 7, len;; // len = len % 8
148 adds loopcnt = -1, loopcnt // --loopcnt
// Form the addresses of the linkage-table slots for .table and .loop56;
// the slots themselves are read further down into ptable and ploop56.
149 addl tmp4 = @ltoff(.table), gp
150 addl tmp3 = @ltoff(.loop56), gp
151 mov ar.ec = MEMLAT + 1 // set EC
152 mov pr.rot = 1 << 16;; // set rotating predicates
153 mov ar.lc = loopcnt // set LC
154 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
155 (p6) br.cond.sptk .src_aligned
// From here on src is misaligned: words are read through asrc, and src is
// advanced past the whole-word part of the copy.
156 add src = src, tmp2 // src += len & -OPSIZ
// sh1 becomes 8 * (src % 8), which is also the byte offset of the
// matching 8-byte entry in .table.
157 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
158 ld8 ploop56 = [tmp3] // ploop56 = &loop56
159 ld8 ptable = [tmp4];; // ptable = &table
160 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
161 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
162 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
163 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
// Preload the first aligned source word before entering the loop.
164 ld8 r[1] = [asrc], 8;; // w0
166 br b6 // jump to the appropriate loop
// Word loop for the case where src and dest are both 8-byte aligned,
// followed by the byte-wise tail copy and the common exit (fragment).
// NOTE(review): the embedded numbering has gaps (166->178, 179->182,
// 184->189->192), so the labels, the loop-closing branches, the byte load of
// the tail loop and the return branch are not visible in this excerpt.
// Loop body: the load is issued in stage 0, the store in stage MEMLAT.
178 (p[0]) ld8 r[0] = [src], 8
179 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
// Tail: nothing to do when no bytes remain; otherwise len is pre-decremented.
182 cmp.eq p6, p0 = len, r0 // is len == 0 ?
183 adds len = -1, len // --len;
184 (p6) br.cond.spnt .restore_and_exit ;;
// Store one byte and advance dest.
189 st1 [dest] = value, 1
// Restore the state saved on entry.
192 mov ar.pfs = saved_pfs // restore the PFS
193 mov pr = saved_pr, -1 // restore the predicate registers
194 mov ar.lc = saved_lc // restore the loop counter
197 // In the case of a backward copy, optimise only the case when everything
198 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
199 // used only when the blocks are overlapping and dest > src.
// NOTE(review): the embedded numbering has gaps (199->202, 212->214,
// 215->217->219, 222->224, 225->227), so the labels and the loop-closing
// branches are not visible in this excerpt.
202 shr.u loopcnt = len, 3 // loopcnt = len / 8
203 add src = src, len // src points one byte past the end
204 add dest = dest, len ;; // dest points one byte past the end
// The epilog counter and rotating predicates are set before the branch
// below, so they are in effect for both the word loop and the byte loop.
205 mov ar.ec = MEMLAT + 1 // set the epilog counter
206 mov pr.rot = 1 << 16 // set rotating predicates
207 adds loopcnt = -1, loopcnt // --loopcnt
208 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
209 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
// Word-wise backward copy: start at the last word and step down by 8.
210 adds src = -8, src // src points to the last word
211 adds dest = -8, dest // dest points to the last word
212 mov ar.lc = loopcnt;; // set the loop counter
214 (p[0]) ld8 r[0] = [src], -8
215 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
217 br.cond.sptk .restore_and_exit
// Byte-wise backward copy: start at the last byte and step down by 1.
219 adds src = -1, src // src points to the last byte
220 adds dest = -1, dest // dest points to the last byte
221 adds loopcnt = -1, len;; // loopcnt = len - 1
222 mov ar.lc = loopcnt;; // set the loop counter
224 (p[0]) ld1 r[0] = [src], -1
225 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
227 br.cond.sptk .restore_and_exit
// Offset table for the computed jump into the shift loops.  Each 8-byte
// entry holds the distance from one shift loop to .loop56; the dispatch code
// indexes it with 8 * (src % 8) and subtracts the entry from the address of
// .loop56.  Entry 0 is never read, because an aligned src branches to
// .src_aligned before the lookup.
// NOTE(review): the table label and any section directives are not visible
// in this excerpt (numbering 227->229).
229 data8 0 // dummy entry
230 data8 .loop56 - .loop8
231 data8 .loop56 - .loop16
232 data8 .loop56 - .loop24
233 data8 .loop56 - .loop32
234 data8 .loop56 - .loop40
235 data8 .loop56 - .loop48
236 data8 .loop56 - .loop56