1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2024 Free Software Foundation, Inc.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
26 The core of the function is the memcpy implementation used in memcpy.S.
27 When bytes have to be copied backwards, only the easy case, when
28 all arguments are multiples of 8, is optimised.
30 In this form, it assumes little endian mode. For big endian mode,
31 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
32 or the UM.be bit should be cleared at the beginning and set at the end. */
// ALIGN(n): alignment helper macro with two alternative definitions.
// When GAS_ALIGN_BREAKS_UNWIND_INFO is defined, ALIGN(n) ignores n and
// expands to a braced "nop 0"; the other definition expands to ".align n".
// NOTE(review): the #else / #endif lines that separate and close the two
// definitions are not visible in this excerpt -- confirm they are present.
57 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
58 # define ALIGN(n) { nop 0 }
60 # define ALIGN(n) .align n
66 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
67 (p[MEMLAT+1]) st8 [dest] = value, 8 ; \
68 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
71 br.ctop.sptk .loop##shift ; \
72 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
// Nrot: 2*MEMLAT+3 rounded up to the next multiple of 8 (the "+ 7, & ~7"
// idiom).  It is passed to alloc as the size of the rotating register
// region, and Nrot - 3 as the number of local registers.
// MEMLAT is defined on a line that is not visible in this excerpt.
75 #define Nrot (((2*MEMLAT+3) + 7) & ~7)
// Function entry: set up the register frame, save the predicate registers
// and ar.lc, then choose the copy direction.  Per the comments on the moves
// below, in0 = dest, in1 = src and in2 = len.
// alloc operands: 3 input registers, Nrot - 3 locals, 0 outputs and a
// rotating region of Nrot registers; the old ar.pfs value is put in r2.
79 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
// Name two rotating register arrays, r[] and q[], used by the copy loops.
80 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
82 mov ret0 = in0 // return value = dest
84 mov saved_pr = pr // save the predicate registers
86 mov saved_lc = ar.lc // save the loop counter
88 or tmp3 = in0, in1 ;; // tmp3 = dest | src
89 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
90 mov dest = in0 // dest
// NOTE(review): the matching moves that initialise src and len from in1 and
// in2 are not visible in this excerpt -- confirm they follow here.
// The low three bits of tmp2 are used later as the number of bytes needed
// to bring dest up to an 8-byte boundary.
93 sub tmp2 = r0, in0 // tmp2 = -dest
94 cmp.eq p6, p0 = in2, r0 // if (len == 0)
95 (p6) br.cond.spnt .restore_and_exit;;// return dest;
// tmp4 is nonzero when any of dest, src or len is not a multiple of 8; it
// is tested both by the forward path and by the backward path.
96 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
// NOTE(review): cmp.le and cmp.lt are signed comparisons (the unsigned forms
// are cmp.leu / cmp.ltu); confirm that signed ordering is acceptable for the
// pointer values this routine can receive.
97 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
98 (p6) br.cond.spnt .forward // to copy forward
// NOTE(review): the comment below compares dest with src + len, but the last
// visible write to tmp3 is dest | src | len; the instruction that sets
// tmp3 = src + len is presumably on a line not shown here -- confirm.
100 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
101 (p6) br.cond.spnt .backward // we have to copy backward
// When neither branch is taken (dest lies at or beyond src + len) execution
// falls through to the code that follows.
// Forward copy.  First decide whether the fast path applies: it requires
// dest, src and len to all be multiples of 8 (tmp4 == 0); otherwise control
// goes to .next.
// NOTE(review): the .forward label targeted by the entry code is not visible
// in this excerpt; it is presumably attached to the first instruction below.
104 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
105 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
106 (p6) br.cond.sptk .next // goto next;
108 // The optimal case, when dest, src and len are all multiples of 8
// The main loop below moves 16 bytes per iteration through two interleaved
// word streams (src/dest and asrc/adest), so a length that is an odd
// number of words needs one word copied up front.
// The immediate 1 << 16 sets only bit 16 of the value written to pr.rot.
111 mov pr.rot = 1 << 16 // set rotating predicates
112 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
// NOTE(review): tmp3 is tested here as the odd-word flag; the instruction
// that recomputes it (presumably from the low bits of len) is not visible in
// this excerpt -- confirm.
113 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
114 adds loopcnt = -1, loopcnt;; // --loopcnt
115 (p6) ld8 value = [src], 8;;
116 (p6) st8 [dest] = value, 8 // copy the "odd" word
117 mov ar.lc = loopcnt // set the loop counter
// len == 8 means the odd word above was the whole copy.  ar.lc was just
// written with len / 16 - 1, and the exit path restores it from saved_lc.
118 cmp.eq p6, p0 = 8, len
119 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
120 adds adest = 8, dest // set adest one word ahead of dest
121 adds asrc = 8, src ;; // set asrc one word ahead of src
122 nop.b 0 // get the "golden" alignment for
123 nop.b 0 // the next loop
// Software-pipelined loop body: the p[0] stage loads one word from each
// stream, and the p[MEMLAT] stage stores the words loaded MEMLAT rotations
// earlier.  All four pointers advance by 16 per iteration.
// NOTE(review): the loop label and the loop-closing branch are not visible
// in this excerpt.
125 (p[0]) ld8 r[0] = [src], 16
126 (p[0]) ld8 q[0] = [asrc], 16
127 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
128 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
// Fast path finished: restore the state saved at entry.
// NOTE(review): the return branch that should follow these two restores is
// not visible in this excerpt.
131 mov pr = saved_pr, -1 // restore the predicate registers
132 mov ar.lc = saved_lc // restore the loop counter
// General forward path, reached when dest, src and len are not all
// multiples of 8.  Short copies (len <= OP_T_THRES) go straight to the byte
// loop at .cpyfew; longer ones first copy single bytes until dest is 8-byte
// aligned.
// NOTE(review): the .next label targeted by the branch in the forward fast
// path check is not visible in this excerpt; it is presumably attached to
// the first instruction below.
135 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
// loopcnt = number of bytes to copy before dest reaches an 8-byte boundary
// (tmp2 holds -dest from the entry code).
136 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
137 (p6) br.cond.spnt .cpyfew // copy byte by byte
// dest already aligned: skip the byte-alignment loop.
139 cmp.eq p6, p0 = loopcnt, r0
140 (p6) br.cond.sptk .dest_aligned
141 sub len = len, loopcnt // len -= -dest % 8
142 adds loopcnt = -1, loopcnt // --loopcnt
// NOTE(review): the instruction that loads loopcnt into ar.lc and the branch
// that closes the .l1 loop are not visible in this excerpt.
145 .l1: // copy -dest % 8 bytes
146 ld1 value = [src], 1 // value = *src++
148 st1 [dest] = value, 1 // *dest++ = value
// dest is 8-byte aligned at this point.  Split the remaining length into
// whole words (loopcnt) and trailing bytes (len), then either take the
// aligned word loop (src aligned too) or jump to the shifted-copy loop that
// matches the misalignment of src.
// NOTE(review): the .dest_aligned label is not visible in this excerpt; it
// is presumably attached to the first instruction below.
151 and sh1 = 7, src // sh1 = src % 8
152 and tmp2 = -8, len // tmp2 = len & -OPSIZ
153 and asrc = -8, src // asrc = src & -OPSIZ -- align src
154 shr.u loopcnt = len, 3 // loopcnt = len / 8
155 and len = 7, len;; // len = len % 8
156 adds loopcnt = -1, loopcnt // --loopcnt
// Form the addresses of the linkage-table slots for .table and .loop56
// (gp-relative); the slots themselves are loaded further down.
157 addl tmp4 = @ltoff(.table), gp
158 addl tmp3 = @ltoff(.loop56), gp
159 mov ar.ec = MEMLAT + 1 // set EC
160 mov pr.rot = 1 << 16;; // set rotating predicates
161 mov ar.lc = loopcnt // set LC
162 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
163 (p6) br.cond.sptk .src_aligned
// Unaligned source from here on.  The shifted loops read through asrc, so
// src is moved ahead by the whole-word byte count (tmp2) now.
164 add src = src, tmp2 // src += len & -OPSIZ
// sh1 becomes 8 * (src % 8).  The same value is used below as the byte
// offset of entry number src % 8 in the table of 8-byte entries.
165 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
166 ld8 ploop56 = [tmp3] // ploop56 = &loop56
167 ld8 ptable = [tmp4];; // ptable = &table
168 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
// The shifted loops have an extra pipeline stage (p[MEMLAT+1] in the LOOP
// macro), hence the larger epilog count.
169 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
170 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
// Each table entry is .loop56 minus the address of one shifted loop, so
// subtracting it from &.loop56 yields the address of that loop.
171 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
// Preload the first aligned source word; the shifted loops merge each newly
// loaded word with the previous one (see the shrp in the LOOP macro).
172 ld8 r[1] = [asrc], 8;; // w0
// NOTE(review): the instruction that copies loopaddr into b6 is not visible
// in this excerpt -- confirm it precedes the branch.
174 br b6 // jump to the appropriate loop
// Word-copy loop body for the case where src and dest are both 8-byte
// aligned: the p[0] stage loads a word, and the p[MEMLAT] stage stores the
// word loaded MEMLAT rotations earlier.  Both pointers advance by 8.
// NOTE(review): the .src_aligned label, the loop label and the loop-closing
// branch are not visible in this excerpt.
186 (p[0]) ld8 r[0] = [src], 8
187 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
// Tail copy: move the remaining len bytes one at a time, then fall into the
// common exit.  Nothing is copied when len is zero.
// NOTE(review): the .cpyfew label is not visible in this excerpt; it is
// presumably attached to the first instruction below.
// The compare is written before the decrement and tests len against zero.
190 cmp.eq p6, p0 = len, r0 // is len == 0 ?
191 adds len = -1, len // --len;
192 (p6) br.cond.spnt .restore_and_exit ;;
// NOTE(review): only the store of the byte loop is visible here; the ar.lc
// setup, the loop label, the ld1 that fills value and the loop-closing
// branch are presumably on lines not shown.
197 st1 [dest] = value, 1
// Common exit: restore the state saved at entry.
// NOTE(review): the .restore_and_exit label and the return branch are not
// visible in this excerpt.
200 mov pr = saved_pr, -1 // restore the predicate registers
201 mov ar.lc = saved_lc // restore the loop counter
204 // In the case of a backward copy, optimise only the case when everything
205 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
206 // used only when the blocks are overlapping and dest > src.
// NOTE(review): the .backward label is not visible in this excerpt; it is
// presumably attached to the first instruction below.
// Both pointers are first moved to one past the end of their buffers; each
// loop then steps them back to the last element and copies downwards.
209 shr.u loopcnt = len, 3 // loopcnt = len / 8
210 add src = src, len // src points one byte past the end
211 add dest = dest, len ;; // dest points one byte past the end
// The epilog counter and rotating predicates set here are used by whichever
// of the two loops below is taken.
212 mov ar.ec = MEMLAT + 1 // set the epilog counter
213 mov pr.rot = 1 << 16 // set rotating predicates
214 adds loopcnt = -1, loopcnt // --loopcnt
// tmp4 still holds (dest | src | len) & 7 from the entry code; no visible
// instruction on the path to this point writes it.
215 cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
216 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
217 adds src = -8, src // src points to the last word
218 adds dest = -8, dest // dest points to the last word
219 mov ar.lc = loopcnt;; // set the loop counter
// Word loop body: load in stage p[0], store in stage p[MEMLAT], both
// pointers stepping down by 8.
// NOTE(review): the loop label and the loop-closing branch are not visible
// in this excerpt.
221 (p[0]) ld8 r[0] = [src], -8
222 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
224 br.cond.sptk .restore_and_exit
// Byte-by-byte backward copy for the case where dest, src and len are not
// all multiples of 8.
// NOTE(review): the .bytecopy label is not visible in this excerpt; it is
// presumably attached to the next instruction.
226 adds src = -1, src // src points to the last byte
227 adds dest = -1, dest // dest points to the last byte
228 adds loopcnt = -1, len;; // loopcnt = len - 1
229 mov ar.lc = loopcnt;; // set the loop counter
// Byte loop body, using the same two pipeline stages as the word loop with
// the pointers stepping down by 1.
// NOTE(review): the loop label and the loop-closing branch are not visible
// in this excerpt.
231 (p[0]) ld1 r[0] = [src], -1
232 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
234 br.cond.sptk .restore_and_exit
// Dispatch table for the shifted-copy loops, indexed by src % 8 with one
// 8-byte entry per index.  Entry i holds .loop56 minus the address of the
// loop for a shift of 8*i bits; the dispatch code subtracts the entry from
// the address of .loop56 to obtain the loop to jump to.
// Entry 0 is a placeholder: the src-aligned case branches to .src_aligned
// before the table is read.
// NOTE(review): the section and alignment directives and the .table label
// are not visible in this excerpt.
240 data8 0 // dummy entry
241 data8 .loop56 - .loop8
242 data8 .loop56 - .loop16
243 data8 .loop56 - .loop24
244 data8 .loop56 - .loop32
245 data8 .loop56 - .loop40
246 data8 .loop56 - .loop48
247 data8 .loop56 - .loop56
// NOTE(review): libc_hidden_builtin_def is a macro defined outside this
// file; presumably it provides the hidden internal alias for memmove --
// confirm against its definition.
249 libc_hidden_builtin_def (memmove)