1 /* Copyright (C) 2011-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library. If not, see
17 <http://www.gnu.org/licenses/>. */
23 #include <arch/chip.h>
25 /* How many cache lines ahead should we prefetch?  This value is the
   iteration bound of the initial prefetch loop in __memcpy, which
   advances the prefetch pointer by CHIP_L2_LINE_SIZE () each step. */
26 #define PREFETCH_LINES_AHEAD 3
/* __memcpy: copy N bytes from SRCV to DSTV (both declared __restrict).

   Phases that are visible in this listing, in order:
     1. Issue PREFETCH_LINES_AHEAD prefetches on the source, one L2
	line apart, never advancing the prefetch pointer to or past
	the last source byte (src1_end).
     2. Copy single bytes until the destination is op_t-aligned.
     3. If the source is then not op_t-aligned (hinted as unlikely),
	run a word loop that merges pairs of source words with
	__insn_dblalign and stores aligned destination words.
     4. Otherwise, while at least one full L2 line remains, copy a
	line at a time (eight op_t temporaries), prefetching as it
	goes; then copy the remaining whole words.
     5. Write any trailing bytes from 'final' using 4-, 2- and 1-byte
	stores, with separate little- and big-endian variants.

   NOTE(review): this listing has lost lines (function braces, loop
   bodies, labels, several declarations such as 'i', 'srci' and
   'a0'..'a3', and the return statements).  Presumably the function
   returns DSTV like memcpy -- confirm against the complete file.  */
28 void * inhibit_loop_to_libcall
29 __memcpy (void *__restrict dstv
, const void *__restrict srcv
, size_t n
)
/* Byte-granular views of the two buffers.  */
31 char *__restrict dst1
= (char *) dstv
;
32 const char *__restrict src1
= (const char *) srcv
;
/* Address of the last source byte; used as the prefetch limit.  */
33 const char *__restrict src1_end
;
/* Next source address to hand to __insn_prefetch.  */
34 const char *__restrict prefetch
;
35 op_t
*__restrict dst8
; /* 8-byte pointer to destination memory. */
36 op_t final
; /* Final bytes to write to trailing word, if any */
46 /* Locate the end of source memory we will copy. Don't prefetch
48 src1_end
= src1
+ n
- 1;
50 /* Prefetch ahead a few cache lines, but not past the end. */
/* After each prefetch the pointer moves forward by one L2 line; if
   that would reach src1_end it is reset to src1 instead, so the
   prefetch address always stays inside the source buffer.  */
52 for (i
= 0; i
< PREFETCH_LINES_AHEAD
; i
++)
54 __insn_prefetch (prefetch
);
55 prefetch
+= CHIP_L2_LINE_SIZE ();
56 prefetch
= (prefetch
< src1_end
) ? prefetch
: src1
;
59 /* Copy bytes until dst is word-aligned. */
60 for (; (uintptr_t) dst1
& (sizeof (op_t
) - 1); n
--)
63 /* 8-byte pointer to destination memory. */
/* The destination is word-aligned from here on.  The source is
   expected to be aligned too (__builtin_expect (..., 0)); the code
   that follows handles the case where it is not.  */
66 if (__builtin_expect ((uintptr_t) src1
& (sizeof (op_t
) - 1), 0))
68 /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but
69 inline it to avoid prologue/epilogue. TODO: Consider
70 prefetching and using wh64 as well. */
/* Integer copies of the pointers, and the number of whole words to
   copy.  NOTE(review): OPSIZ is presumably sizeof (op_t) -- it is
   defined outside this listing.  */
73 long int dstp
= (long int) dst1
;
74 long int srcp
= (long int) src1
;
75 long int len
= n
/ OPSIZ
;
77 /* Save the initial source pointer so we know the number of
78 bytes to shift for merging two unaligned results. */
81 /* Make SRCP aligned by rounding it down to the beginning of the
82 `op_t' it points in the middle of. */
/* The four groups of loads below prime different pairs of the
   a0..a3 registers.  NOTE(review): they look like the arms of a
   dispatch on len (as in glibc's generic wordcopy) that selects the
   entry point into the unrolled loop; the dispatch itself and the
   labels are not visible here -- confirm.  */
88 a1
= ((op_t
*) srcp
)[0];
89 a2
= ((op_t
*) srcp
)[1];
94 a0
= ((op_t
*) srcp
)[0];
95 a1
= ((op_t
*) srcp
)[1];
100 if (OP_T_THRES
<= 3 * OPSIZ
&& len
== 0)
102 a3
= ((op_t
*) srcp
)[0];
103 a0
= ((op_t
*) srcp
)[1];
108 a2
= ((op_t
*) srcp
)[0];
109 a3
= ((op_t
*) srcp
)[1];
112 if (OP_T_THRES
<= 3 * OPSIZ
&& len
== 0)
114 goto do4
; /* No-op. */
/* Four-way unrolled merge loop.  Each step loads the next aligned
   source word, combines the two previously loaded words with
   __insn_dblalign (using srci to select the byte offset), and stores
   the result as one aligned destination word.  The registers rotate
   a2 -> a3 -> a0 -> a1 so every loaded word is used by two
   consecutive merges.  */
120 a0
= ((op_t
*) srcp
)[0];
121 a2
= __insn_dblalign (a2
, a3
, srci
);
122 ((op_t
*) dstp
)[0] = a2
;
126 a1
= ((op_t
*) srcp
)[0];
127 a3
= __insn_dblalign (a3
, a0
, srci
);
128 ((op_t
*) dstp
)[0] = a3
;
132 a2
= ((op_t
*) srcp
)[0];
133 a0
= __insn_dblalign (a0
, a1
, srci
);
134 ((op_t
*) dstp
)[0] = a0
;
138 a3
= ((op_t
*) srcp
)[0];
139 a1
= __insn_dblalign (a1
, a2
, srci
);
140 ((op_t
*) dstp
)[0] = a1
;
147 /* This is the right position for do0. Please don't move
150 ((op_t
*) dstp
)[0] = __insn_dblalign (a2
, a3
, srci
);
156 a0
= ((const char *) srcp
<= src1_end
) ? ((op_t
*) srcp
)[0] : 0;
158 final
= __insn_dblalign (a3
, a0
, srci
);
159 dst8
= (op_t
*)(dstp
+ OPSIZ
);
165 const op_t
*__restrict src8
= (const op_t
*) src1
;
167 /* src8 and dst8 are both word-aligned. */
/* The line-at-a-time path is entered only when at least one full L2
   cache line of data remains.  */
168 if (n
>= CHIP_L2_LINE_SIZE ())
170 /* Copy until 'dst' is cache-line-aligned. */
171 for (; (uintptr_t) dst8
& (CHIP_L2_LINE_SIZE () - 1);
/* Main loop: runs while a whole L2 line is still left to copy.  */
175 for (; n
>= CHIP_L2_LINE_SIZE ();)
/* Eight word-sized temporaries hold the loads for one iteration.  */
177 op_t tmp0
, tmp1
, tmp2
, tmp3
, tmp4
, tmp5
, tmp6
, tmp7
;
179 /* Prefetch and advance to next line to prefetch, but
180 don't go past the end. */
181 __insn_prefetch (prefetch
);
182 prefetch
+= CHIP_L2_LINE_SIZE ();
183 prefetch
= (prefetch
< src1_end
) ? prefetch
:
186 /* Do all the loads before wh64. This is necessary if
187 [src8, src8+7] and [dst8, dst8+7] share the same
188 cache line and dst8 <= src8, as can be the case when
189 called from memmove, or with code tested on x86 whose
190 memcpy always works with forward copies. */
/* Build-time guard: compilation is rejected unless the L2 line size
   is exactly 64 bytes, because the surrounding code assumes it.  */
213 #if CHIP_L2_LINE_SIZE() != 64
214 # error "Fix code that assumes particular L2 cache line size."
/* Copy the remaining whole words, one op_t per iteration.  */
218 for (; n
>= sizeof (op_t
); n
-= sizeof (op_t
))
/* Hinted as the common case: nothing is left after the word loop.  */
221 if (__builtin_expect (n
== 0, 1))
227 /* n != 0 if we get here. Write out any trailing bytes. */
228 dst1
= (char *) dst8
;
/* Little-endian variant: each store truncates 'final', so the
   trailing bytes are taken from its low-order end.
   NOTE(review): the size tests, pointer advances and any shifts of
   'final' between these stores are not visible in this listing.  */
229 #ifndef __BIG_ENDIAN__
232 *(uint32_t *) dst1
= final
;
239 *(uint16_t *) dst1
= final
;
245 *(uint8_t *) dst1
= final
;
/* The stores below take their bytes from higher bit positions of
   'final' (shifted right by 32, 16 and 8).  NOTE(review): presumably
   this is the big-endian branch; the matching #else and #endif lines
   are not visible here -- confirm.  */
249 *(uint32_t *) dst1
= final
>> 32;
258 *(uint16_t *) dst1
= final
>> 16;
266 *(uint8_t *) dst1
= final
>> 8;
/* Publish __memcpy under the standard name memcpy through the
   weak_alias macro.  NOTE(review): libc_hidden_builtin_def is defined
   outside this file; presumably it creates the hidden internal
   definition used for calls from inside the library -- confirm.  */
271 weak_alias (__memcpy
, memcpy
)
272 libc_hidden_builtin_def (memcpy
)