/* Copyright (C) 2012-2016 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* Parameters and result.  */

ENTRY_ALIGN (memmove, 6)

	b.hs	memcpy		/* No overlap.  */
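	/* The overall dispatch is, in rough C (using the dstin, src and
	 * count arguments):
	 *
	 *   if (dstin < src)                copy downwards (see below);
	 *   else if (dstin >= src + count)  no overlap, plain memcpy;
	 *   else                            overlapping upwards move.  */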
	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
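	/* Since DST is above SRC, copying the highest addresses first
	 * guarantees no source byte is overwritten before it is read.  */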
	b.ge	L(mov_not_short_up)
	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
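	/* tmp1 is now 0, 16, 32 or 48: the number of bytes the 16-byte
	 * pairs below must move before the sub-16-byte tail.  */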
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
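	/* Bits 3..0 of count each enable one power-of-two sized move
	 * (8, 4, 2 or 1 bytes), so any residue up to 15 bytes is
	 * handled without a loop.  */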
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
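	/* tmp2 = src & 15: moving exactly this many bytes backwards
	 * leaves SRC on a 16-byte boundary.  */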
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
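	/* Unlike memcpy, which may re-copy a few bytes for convenience,
	 * every byte here is loaded and stored exactly once: a second
	 * pass could read data the move has already overwritten.  */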
	/* There may be less than 63 bytes to go now.  */

	subs	count, count, #128
	b.ge	L(mov_body_large_up)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
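	/* count already has 128 subtracted and 64 bytes were just moved,
	 * so the bytes still to go are exactly count & 0x3f; the tail
	 * code above finishes them off.  */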
	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
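	/* AArch64 instructions are 4 bytes each, so the short load/store
	 * loop below fits comfortably inside one such line once its start
	 * is aligned.  */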
L(mov_body_large_up):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!

	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
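	/* In the loop each STP is paired with the LDP for the following
	 * iteration, hiding load latency behind the stores; once count
	 * goes negative the four STPs above flush the last 64 bytes that
	 * were loaded but not yet stored.  */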
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	b.ls	memcpy		/* May overlap, but not critically.  */
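	/* This assumes the companion memcpy works on units of at most 16
	 * bytes: with DST at least 16 bytes below SRC, its forward copy
	 * then cannot overwrite source bytes it has yet to read.  */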
	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	b.ge	L(mov_not_short_down)
	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
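	/* Mirror of the upwards tail: one, two or three of these 16-byte
	 * pairs fall through, chosen by bits 5:4 of count.  */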
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  */
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
L(mov_not_short_down):
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
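	/* tmp2 is expected to hold -SRC here, so the AND leaves
	 * (16 - (SRC & 15)) & 15: the distance to move forwards before
	 * SRC is 16-byte aligned.  */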
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
	/* There may be less than 63 bytes to go now.  */

	subs	count, count, #128
	b.ge	L(mov_body_large_down)
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
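	/* As in the upwards case, the 0-63 bytes left after this 64-byte
	 * block are count & 0x3f and are finished by the tail code.  */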
	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
L(mov_body_large_down):
	/* There are at least 128 bytes to move.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	dst, dst, #64 + 16
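	/* The +16 cancels the pre-bias applied before the loop and the
	 * +64 accounts for the four pairs just stored without writeback,
	 * leaving DST at the next byte the tail code must write.  */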
libc_hidden_builtin_def (memmove)