1 /* Copyright (C) 2012-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <http://www.gnu.org/licenses/>. */
/* NOTE(review): this is a fragmented excerpt — the embedded numbers at the
 * start of each line are original-file line numbers, and the gaps between
 * them mean labels (e.g. the tail-copy and L(cpy_body_large) label
 * definitions), branches, and the #defines that alias dst/src/count and
 * the A_l/A_h .. D_l/D_h register pairs are elided from this view.
 * Comments below only describe what the visible instructions do;
 * anything depending on elided lines is marked as an assumption.  */
48 ENTRY_ALIGN (memcpy, 6)
56 /* Deal with small copies quickly by dropping straight into the
59 /* Copy up to 48 bytes of data. At this point we only need the
60 * bottom 6 bits of count to be accurate. */
/* tmp1 = count & 0x30: selects how many 16-byte chunks (0..3) of the
 * 48-byte tail to copy; also sets flags for the (elided) branches.  */
61 ands tmp1, count, #0x30
/* Negative offsets: src/dst presumably point past the end of the region
 * here (advanced in elided lines) — TODO confirm against the full file.
 * Each ldp/stp pair moves 16 bytes.  */
68 ldp A_l, A_h, [src, #-48]
69 stp A_l, A_h, [dst, #-48]
71 ldp A_l, A_h, [src, #-32]
72 stp A_l, A_h, [dst, #-32]
74 ldp A_l, A_h, [src, #-16]
75 stp A_l, A_h, [dst, #-16]
/* count &= 15: remaining sub-16-byte residue; flags feed an elided branch.  */
78 ands count, count, #15
81 ldp A_l, A_h, [src, #-16]
83 stp A_l, A_h, [dst, #-16]
88 /* Copy up to 15 bytes of data. Does not assume additional data
/* Store 2 bytes with post-increment; part of the byte/halfword/word
 * residue sequence (surrounding steps elided).  */
100 strh tmp1w, [dst], #2
109 /* We don't much care about the alignment of DST, but we want SRC
110 * to be 128-bit (16 byte) aligned so that we don't cross cache line
111 * boundaries on both loads and stores. */
113 ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
115 sub count, count, tmp2
116 /* Copy more data than needed; it's faster than jumping
117 * around copying sub-Quadword quantities. We know that
118 * it can't overrun. */
123 /* There may be less than 63 bytes to go now. */
/* Pre-decrement by 128 so the large-copy loop can run while two full
 * 64-byte iterations remain (count is biased by -128 inside the loop).  */
127 subs count, count, #128
128 b.ge L(cpy_body_large)
129 /* Less than 128 bytes to copy, so handle 64 here and then jump
/* Straight-line 64-byte copy, 16 bytes per ldp/stp pair (the [src, #0]
 * A-pair load/store presumably sits in elided lines 131/135).  */
132 ldp B_l, B_h, [src, #16]
133 ldp C_l, C_h, [src, #32]
134 ldp D_l, D_h, [src, #48]
136 stp B_l, B_h, [dst, #16]
137 stp C_l, C_h, [dst, #32]
138 stp D_l, D_h, [dst, #48]
145 /* Critical loop. Start at a new cache line boundary. Assuming
146 * 64 bytes per line this ensures the entire loop is in one line. */
149 /* There are at least 128 bytes to copy. */
/* Loop prologue: load the first 64 bytes into the four register pairs
 * and pre-bias dst/src by -16 so the loop body can use offsets
 * #16..#64 with writeback on the last access of each group.  */
150 ldp A_l, A_h, [src, #0]
151 sub dst, dst, #16 /* Pre-bias. */
152 ldp B_l, B_h, [src, #16]
153 ldp C_l, C_h, [src, #32]
154 ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
/* Software-pipelined body: each store of the current 64-byte group is
 * interleaved with the load of the next group, hiding load latency.
 * The '!' writeback on the #64 accesses advances dst/src by 64 per
 * iteration.  The loop's backward branch is in an elided line after
 * the subs (count stays biased by -128, so b.ge would re-enter while
 * >= 128 bytes remain — TODO confirm branch in full file).  */
156 stp A_l, A_h, [dst, #16]
157 ldp A_l, A_h, [src, #16]
158 stp B_l, B_h, [dst, #32]
159 ldp B_l, B_h, [src, #32]
160 stp C_l, C_h, [dst, #48]
161 ldp C_l, C_h, [src, #48]
162 stp D_l, D_h, [dst, #64]!
163 ldp D_l, D_h, [src, #64]!
164 subs count, count, #64
/* Loop epilogue: drain the last 64 bytes still held in A..D.  */
166 stp A_l, A_h, [dst, #16]
167 stp B_l, B_h, [dst, #32]
168 stp C_l, C_h, [dst, #48]
169 stp D_l, D_h, [dst, #64]
/* Undo the -16 pre-bias and step past the drained group.  */
171 add dst, dst, #64 + 16
176 libc_hidden_builtin_def (memcpy)