1 /* Optimized version of the standard memmove() function.
2 This file is part of the GNU C Library.
3 Copyright (C) 2000-2020 Free Software Foundation, Inc.
4 Contributed by Dan Pop <Dan.Pop@cern.ch>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
27 The core of the function is the memcpy implementation used in memcpy.S.
28 When bytes have to be copied backwards, only the easy case, when
29 all arguments are multiples of 8, is optimised.
31 In this form, it assumes little endian mode. For big endian mode,
32 sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
33 or the UM.be bit should be cleared at the beginning and set at the end. */
/* ALIGN(n) has two definitions: a `{ nop 0 }' bundle, selected when
   GAS_ALIGN_BREAKS_UNWIND_INFO is defined, and a plain `.align n'.  */
58 #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
59 # define ALIGN(n) { nop 0 }
61 # define ALIGN(n) .align n
67 (p[0]) ld8 r[0] = [asrc], 8 ; /* w1: next 8-byte word read through asrc */ \
68 (p[MEMLAT+1]) st8 [dest] = value, 8 ; /* store the merged word, dest += 8 */ \
69 (p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; /* merge a register pair, shifted right by `shift' bits */ \
72 br.ctop.sptk .loop##shift ; \
73 br.cond.sptk .cpyfew ; /* loop finished: deal with the remaining bytes */
/* Nrot is 2*MEMLAT+3 rounded up to a multiple of 8.  2*MEMLAT+3 is the
   combined size of the r[MEMLAT + 2] and q[MEMLAT + 1] rotating sets named
   by .rotr; Nrot is the rotating-region size passed to alloc.  */
76 #define Nrot (((2*MEMLAT+3) + 7) & ~7)
// Entry: allocate the register frame -- 3 inputs, Nrot - 3 locals, no
// outputs, Nrot rotating registers -- and name the rotating sets r[] and q[].
80 alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
81 .rotr r[MEMLAT + 2], q[MEMLAT + 1]
83 mov ret0 = in0 // return value = dest
// pr and ar.lc are overwritten by the copy loops (mov pr.rot / mov ar.lc),
// so copies are kept here and restored before returning.
85 mov saved_pr = pr // save the predicate registers
87 mov saved_lc = ar.lc // save the loop counter
// OR the three arguments together so that a single "& 7" (tmp4, below)
// tells whether dest, src and len are all multiples of 8.
89 or tmp3 = in0, in1 ;; // tmp3 = dest | src
90 or tmp3 = tmp3, in2 // tmp3 = dest | src | len
91 mov dest = in0 // dest
94 sub tmp2 = r0, in0 // tmp2 = -dest
95 cmp.eq p6, p0 = in2, r0 // if (len == 0)
96 (p6) br.cond.spnt .restore_and_exit;;// return dest;
97 and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
// Direction choice: copy backward only when src < dest < src + len;
// every other case copies forward.
// NOTE(review): cmp.le / cmp.lt compare their operands as signed values --
// confirm this is acceptable for the address ranges involved.
98 cmp.le p6, p0 = dest, src // if dest <= src it's always safe
99 (p6) br.cond.spnt .forward // to copy forward
100 add tmp3 = src, len;;
101 cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
102 (p6) br.cond.spnt .backward // we have to copy backward
// Forward copy.  loopcnt counts 16-byte iterations; the code after the
// branch below handles only the case tmp4 == 0.
105 shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
106 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
107 (p6) br.cond.sptk .next // goto next;
109 // The optimal case, when dest, src and len are all multiples of 8
112 mov pr.rot = 1 << 16 // set rotating predicates (only p16 = 1)
113 mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
114 cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
// ar.lc is loaded with loopcnt below; a counted loop runs ar.lc + 1 times,
// hence the decrement.
115 adds loopcnt = -1, loopcnt;; // --loopcnt
// When p6 is set, one 8-byte word is copied here, ahead of the main loop
// that moves 16 bytes per iteration.
116 (p6) ld8 value = [src], 8;;
117 (p6) st8 [dest] = value, 8 // copy the "odd" word
118 mov ar.lc = loopcnt // set the loop counter
119 cmp.eq p6, p0 = 8, len // len == 8: the word above was everything
120 (p6) br.cond.spnt .restore_and_exit;;// the one-word special case
// The main loop uses two interleaved streams, src/dest and asrc/adest,
// 8 bytes apart, each advancing by 16 per iteration.
121 adds adest = 8, dest // set adest one word ahead of dest
122 adds asrc = 8, src ;; // set asrc one word ahead of src
123 nop.b 0 // get the "golden" alignment for
124 nop.b 0 // the next loop
// Loads are predicated on p[0]; the stores are predicated on p[MEMLAT] and
// take their data from r[MEMLAT] / q[MEMLAT].
126 (p[0]) ld8 r[0] = [src], 16
127 (p[0]) ld8 q[0] = [asrc], 16
128 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
129 (p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
// Put back the values saved at entry.
132 mov pr = saved_pr, -1 // restore the predicate registers
133 mov ar.lc = saved_lc // restore the loop counter
// Not everything is a multiple of 8.  Short copies (len <= OP_T_THRES) are
// done byte by byte; otherwise dest is first brought to 8-byte alignment.
136 cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
137 and loopcnt = 7, tmp2 // loopcnt = -dest % 8
138 (p6) br.cond.spnt .cpyfew // copy byte by byte
// loopcnt is the number of bytes needed to make dest a multiple of 8;
// skip the byte loop when it is already 0.
140 cmp.eq p6, p0 = loopcnt, r0
141 (p6) br.cond.sptk .dest_aligned
142 sub len = len, loopcnt // len -= -dest % 8
143 adds loopcnt = -1, loopcnt // --loopcnt
146 .l1: // copy -dest % 8 bytes
147 ld1 value = [src], 1 // value = *src++
149 st1 [dest] = value, 1 // *dest++ = value
// Split the remaining length into whole 8-byte words (loopcnt) and a tail
// (len % 8), and compute the misalignment of src (sh1).
152 and sh1 = 7, src // sh1 = src % 8
153 and tmp2 = -8, len // tmp2 = len & -OPSIZ
154 and asrc = -8, src // asrc = src & -OPSIZ -- align src
155 shr.u loopcnt = len, 3 // loopcnt = len / 8
156 and len = 7, len;; // len = len % 8
157 adds loopcnt = -1, loopcnt // --loopcnt
// tmp4 / tmp3 = addresses of the linkage-table entries for .table and
// .loop56; the actual addresses are loaded from them further down.
158 addl tmp4 = @ltoff(.table), gp
159 addl tmp3 = @ltoff(.loop56), gp
160 mov ar.ec = MEMLAT + 1 // set EC
161 mov pr.rot = 1 << 16;; // set rotating predicates
162 mov ar.lc = loopcnt // set LC
163 cmp.eq p6, p0 = sh1, r0 // is the src aligned?
164 (p6) br.cond.sptk .src_aligned
// src is not 8-byte aligned.  The word loops defined by LOOP() read through
// asrc, so src itself is moved past the whole words here.
165 add src = src, tmp2 // src += len & -OPSIZ
// sh1 becomes 8 * (src % 8): it is used below as the byte offset of an
// 8-byte entry in the table.
166 shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
167 ld8 ploop56 = [tmp3] // ploop56 = &loop56
168 ld8 ptable = [tmp4];; // ptable = &table
169 add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
170 mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
171 ld8 tmp4 = [tmp3];; // tmp4 = loop offset
172 sub loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
173 ld8 r[1] = [asrc], 8;; // w0: first word, read from the aligned-down src
175 br b6 // jump to the appropriate loop
// Straight 8-byte word loop: loads go through src itself (no shrp merge)
// under p[0]; stores are predicated on p[MEMLAT].
187 (p[0]) ld8 r[0] = [src], 8
188 (p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
// Remaining bytes: leave immediately when len is 0.
191 cmp.eq p6, p0 = len, r0 // is len == 0 ?
192 adds len = -1, len // --len;
193 (p6) br.cond.spnt .restore_and_exit ;;
198 st1 [dest] = value, 1 // *dest++ = value
// Put back the values saved at entry.
201 mov pr = saved_pr, -1 // restore the predicate registers
202 mov ar.lc = saved_lc // restore the loop counter
205 // In the case of a backward copy, optimise only the case when everything
206 // is a multiple of 8, otherwise copy byte by byte. The backward copy is
207 // used only when the blocks are overlapping and dest > src.
210 shr.u loopcnt = len, 3 // loopcnt = len / 8
211 add src = src, len // src points one byte past the end
212 add dest = dest, len ;; // dest points one byte past the end
213 mov ar.ec = MEMLAT + 1 // set the epilog counter
214 mov pr.rot = 1 << 16 // set rotating predicates
215 adds loopcnt = -1, loopcnt // --loopcnt
216 cmp.ne p6, p0 = tmp4, r0 // if (((dest | src | len) & 7) != 0)
217 (p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
// Word-at-a-time backward copy: start at the last 8-byte word and step
// both pointers down by 8.
218 adds src = -8, src // src points to the last word
219 adds dest = -8, dest // dest points to the last word
220 mov ar.lc = loopcnt;; // set the loop counter
222 (p[0]) ld8 r[0] = [src], -8
223 (p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
225 br.cond.sptk .restore_and_exit
// Byte-at-a-time backward copy: start at the last byte and step both
// pointers down by 1; the loop counter is len - 1.
227 adds src = -1, src // src points to the last byte
228 adds dest = -1, dest // dest points to the last byte
229 adds loopcnt = -1, len;; // loopcnt = len - 1
230 mov ar.lc = loopcnt;; // set the loop counter
232 (p[0]) ld1 r[0] = [src], -1
233 (p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
235 br.cond.sptk .restore_and_exit
// Offset table for the shifted-copy loops.  The lookup adds 8 * (src % 8)
// to the table address and subtracts the loaded entry from the address of
// .loop56, so entry k selects .loop<8k>.  Entry 0 is a placeholder: when
// src % 8 == 0 the code branches to .src_aligned before reading the table.
241 data8 0 // dummy entry
242 data8 .loop56 - .loop8
243 data8 .loop56 - .loop16
244 data8 .loop56 - .loop24
245 data8 .loop56 - .loop32
246 data8 .loop56 - .loop40
247 data8 .loop56 - .loop48
248 data8 .loop56 - .loop56
// NOTE(review): libc_hidden_builtin_def is defined outside this file;
// presumably it emits the hidden alias used for calls to memmove from
// inside the library -- confirm against its definition.
250 libc_hidden_builtin_def (memmove)