1 /* Copyright (C) 2017-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <http://www.gnu.org/licenses/>. */
21 /* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */
47 /* Alias with A_l and A_h to train the prefetcher. */
53 The copy has 4 distinct parts:
54 * Small copies of 16 bytes and under
55 * Medium sized copies of 17-96 bytes
56 * Large copies where the source address is higher than the destination
58 * Large copies where the destination address is higher than the source
59 (copy backward, or move).
61 We use only two register pairs x6,x7 and x22,x23 for the copies and copy 32
62 bytes at a time to correctly train the hardware prefetcher for better
/* NOTE(review): this chunk is an excerpt -- the number fused to the start of
   each line is the original file's line number, and the gaps show that
   several lines are not visible here (the initial size-dispatch cmp, labels
   such as L(copy96) and the loop heads, conditional branches, and some
   ldp/stp of the A register pair).  The comments below describe only what
   the visible instructions do; confirm control flow against the full file.  */
64 ENTRY_ALIGN (__memmove_falkor, 6)
/* Compute one-past-the-end pointers for both buffers.  */
67 add srcend, src, count
68 add dstend, dstin, count
/* Part of the size/overlap dispatch; the cmp that sets the incoming flags is
   not visible in this excerpt -- TODO confirm against the full file.  */
70 ccmp tmp1, count, 2, hi
78 /* Medium copies: 17..96 bytes. */
/* Bit 6 of tmp1 set => count is in 64..96, handled by the copy96 path
   (label not visible in this excerpt).  */
81 tbnz tmp1, 6, L(copy96)
/* 17..63 bytes: load head/tail pairs first, then store, so the copy is
   correct even when source and destination overlap.  The A-pair load/store
   covering the first 16 bytes is among the elided lines.  */
82 ldp D_l, D_h, [srcend, -16]
84 ldp B_l, B_h, [src, 16]
85 ldp C_l, C_h, [srcend, -32]
86 stp B_l, B_h, [dstin, 16]
87 stp C_l, C_h, [dstend, -32]
90 stp D_l, D_h, [dstend, -16]
94 /* Small copies: 0..16 bytes. */
/* Tails of the 8..16, 4..7 and 2..3 byte paths; the branches and the
   matching loads/stores at the buffer start are elided here.  Each path
   loads before storing so overlap is handled.  */
101 str A_h, [dstend, -8]
108 ldr A_hw, [srcend, -4]
110 str A_hw, [dstend, -4]
117 ldrh A_hw, [srcend, -2]
119 strh A_hw, [dstend, -2]
130 /* Copy 64..96 bytes. Copy 64 bytes from the start and
131 32 bytes from the end. */
/* All six pair loads complete before any store, so overlapping buffers are
   safe; the final two stores may overwrite bytes already written when
   count < 96, which is harmless.  The A-pair load from [src] is elided.  */
133 ldp B_l, B_h, [src, 16]
134 ldp C_l, C_h, [src, 32]
135 ldp D_l, D_h, [src, 48]
136 ldp E_l, E_h, [srcend, -32]
137 ldp F_l, F_h, [srcend, -16]
138 stp A_l, A_h, [dstin]
139 stp B_l, B_h, [dstin, 16]
140 stp C_l, C_h, [dstin, 32]
141 stp D_l, D_h, [dstin, 48]
142 stp E_l, E_h, [dstend, -32]
143 stp F_l, F_h, [dstend, -16]
146 /* Align SRC to 16 byte alignment so that we don't cross cache line
147 boundaries on both loads and stores. There are at least 96 bytes
148 to copy, so copy 16 bytes unaligned and then align. The loop
149 copies 32 bytes per iteration and prefetches one iteration ahead. */
153 sub count, count, 64 + 16 /* Test and readjust count. */
160 add count, count, tmp1 /* Count is now 16 too large. */
/* Software pipeline: the Q and A pairs each hold 16 bytes loaded one
   iteration ahead of their store.  Pre-indexed ldp advances src by 16
   per load.  The loop label and the backward branch are elided.  */
161 ldp Q_l, Q_h, [src, 16]!
162 stp A_l, A_h, [dstin]
163 ldp A_l, A_h, [src, 16]!
/* Loop body: 32 bytes stored per iteration; subs drives the exit branch
   (elided).  */
166 subs count, count, 32
167 stp Q_l, Q_h, [dst, 16]
168 ldp Q_l, Q_h, [src, 16]!
169 stp A_l, A_h, [dst, 32]!
170 ldp A_l, A_h, [src, 16]!
173 /* Write the last full set of 32 bytes. The remainder is at most 32
174 bytes, so it is safe to always copy 32 bytes from the end even if
175 there is just 1 byte left. */
177 ldp C_l, C_h, [srcend, -32]
178 stp Q_l, Q_h, [dst, 16]
179 ldp Q_l, Q_h, [srcend, -16]
180 stp A_l, A_h, [dst, 32]
181 stp C_l, C_h, [dstend, -32]
182 stp Q_l, Q_h, [dstend, -16]
194 /* Align SRCEND to 16 byte alignment so that we don't cross cache line
195 boundaries on both loads and stores. There are at least 96 bytes
196 to copy, so copy 16 bytes unaligned and then align. The loop
197 copies 32 bytes per iteration and prefetches one iteration ahead. */
/* Backward-copy variant: walk down from the ends so dst > src overlaps are
   handled.  tmp1 presumably holds the low alignment bits of srcend (the
   line computing it is elided) -- TODO confirm.  */
199 ldp A_l, A_h, [srcend, -16]
201 sub srcend, srcend, tmp1
202 ldp Q_l, Q_h, [srcend, -16]!
203 stp A_l, A_h, [dstend, -16]
204 sub count, count, tmp1
205 ldp A_l, A_h, [srcend, -16]!
206 sub dstend, dstend, tmp1
/* Descending loop body, mirror image of the forward loop: 32 bytes per
   iteration via the pipelined Q/A pairs; exit branch elided.  */
210 subs count, count, 32
211 stp Q_l, Q_h, [dstend, -16]
212 ldp Q_l, Q_h, [srcend, -16]!
213 stp A_l, A_h, [dstend, -32]!
214 ldp A_l, A_h, [srcend, -16]!
217 /* Write the last full set of 32 bytes. The remainder is at most 32
218 bytes, so it is safe to always copy 32 bytes from the start even if
219 there is just 1 byte left. */
221 ldp C_l, C_h, [src, 16]
222 stp Q_l, Q_h, [dstend, -16]
224 stp A_l, A_h, [dstend, -32]
225 stp C_l, C_h, [dstin, 16]
226 stp Q_l, Q_h, [dstin]
231 END (__memmove_falkor)
232 libc_hidden_builtin_def (__memmove_falkor)