Update copyright notices with scripts/update-copyrights
[glibc.git] / ports / sysdeps / tile / tilegx / memcpy.c
blob5d5df19ef34bb12e8a02303b7fb406084388e68b
/* Copyright (C) 2011-2014 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <http://www.gnu.org/licenses/>.  */
19 #include <string.h>
20 #include <stdint.h>
21 #include <stdlib.h>
22 #include <memcopy.h>
23 #include <arch/chip.h>
/* How many cache lines ahead should we prefetch?  __memcpy issues this
   many prefetches on the source buffer, one per L2 cache line, before it
   starts copying.  */
#define PREFETCH_LINES_AHEAD 3
28 void *
29 __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
31 char *__restrict dst1 = (char *) dstv;
32 const char *__restrict src1 = (const char *) srcv;
33 const char *__restrict src1_end;
34 const char *__restrict prefetch;
35 op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
36 op_t final; /* Final bytes to write to trailing word, if any */
37 long i;
39 if (n < 16)
41 for (; n; n--)
42 *dst1++ = *src1++;
43 return dstv;
46 /* Locate the end of source memory we will copy. Don't prefetch
47 past this. */
48 src1_end = src1 + n - 1;
50 /* Prefetch ahead a few cache lines, but not past the end. */
51 prefetch = src1;
52 for (i = 0; i < PREFETCH_LINES_AHEAD; i++)
54 __insn_prefetch (prefetch);
55 prefetch += CHIP_L2_LINE_SIZE ();
56 prefetch = (prefetch < src1_end) ? prefetch : src1;
59 /* Copy bytes until dst is word-aligned. */
60 for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
61 *dst1++ = *src1++;
63 /* 8-byte pointer to destination memory. */
64 dst8 = (op_t *) dst1;
66 if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
68 /* Misaligned copy. Use glibc's _wordcopy_fwd_dest_aligned, but
69 inline it to avoid prologue/epilogue. TODO: Consider
70 prefetching and using wh64 as well. */
71 void * srci;
72 op_t a0, a1, a2, a3;
73 long int dstp = (long int) dst1;
74 long int srcp = (long int) src1;
75 long int len = n / OPSIZ;
77 /* Save the initial source pointer so we know the number of
78 bytes to shift for merging two unaligned results. */
79 srci = (void *) srcp;
81 /* Make SRCP aligned by rounding it down to the beginning of the
82 `op_t' it points in the middle of. */
83 srcp &= -OPSIZ;
85 switch (len % 4)
87 case 2:
88 a1 = ((op_t *) srcp)[0];
89 a2 = ((op_t *) srcp)[1];
90 len += 2;
91 srcp += 2 * OPSIZ;
92 goto do1;
93 case 3:
94 a0 = ((op_t *) srcp)[0];
95 a1 = ((op_t *) srcp)[1];
96 len += 1;
97 srcp += 2 * OPSIZ;
98 goto do2;
99 case 0:
100 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
101 return dstv;
102 a3 = ((op_t *) srcp)[0];
103 a0 = ((op_t *) srcp)[1];
104 len += 0;
105 srcp += 2 * OPSIZ;
106 goto do3;
107 case 1:
108 a2 = ((op_t *) srcp)[0];
109 a3 = ((op_t *) srcp)[1];
110 srcp += 2 * OPSIZ;
111 len -= 1;
112 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
113 goto do0;
114 goto do4; /* No-op. */
119 do4:
120 a0 = ((op_t *) srcp)[0];
121 a2 = __insn_dblalign (a2, a3, srci);
122 ((op_t *) dstp)[0] = a2;
123 srcp += OPSIZ;
124 dstp += OPSIZ;
125 do3:
126 a1 = ((op_t *) srcp)[0];
127 a3 = __insn_dblalign (a3, a0, srci);
128 ((op_t *) dstp)[0] = a3;
129 srcp += OPSIZ;
130 dstp += OPSIZ;
131 do2:
132 a2 = ((op_t *) srcp)[0];
133 a0 = __insn_dblalign (a0, a1, srci);
134 ((op_t *) dstp)[0] = a0;
135 srcp += OPSIZ;
136 dstp += OPSIZ;
137 do1:
138 a3 = ((op_t *) srcp)[0];
139 a1 = __insn_dblalign (a1, a2, srci);
140 ((op_t *) dstp)[0] = a1;
141 srcp += OPSIZ;
142 dstp += OPSIZ;
143 len -= 4;
145 while (len != 0);
147 /* This is the right position for do0. Please don't move
148 it into the loop. */
149 do0:
150 ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
152 n = n % OPSIZ;
153 if (n == 0)
154 return dstv;
156 a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
158 final = __insn_dblalign (a3, a0, srci);
159 dst8 = (op_t *)(dstp + OPSIZ);
161 else
163 /* Aligned copy. */
165 const op_t *__restrict src8 = (const op_t *) src1;
167 /* src8 and dst8 are both word-aligned. */
168 if (n >= CHIP_L2_LINE_SIZE ())
170 /* Copy until 'dst' is cache-line-aligned. */
171 for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
172 n -= sizeof (op_t))
173 *dst8++ = *src8++;
175 for (; n >= CHIP_L2_LINE_SIZE ();)
177 op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
179 /* Prefetch and advance to next line to prefetch, but
180 don't go past the end. */
181 __insn_prefetch (prefetch);
182 prefetch += CHIP_L2_LINE_SIZE ();
183 prefetch = (prefetch < src1_end) ? prefetch :
184 (const char *) src8;
186 /* Do all the loads before wh64. This is necessary if
187 [src8, src8+7] and [dst8, dst8+7] share the same
188 cache line and dst8 <= src8, as can be the case when
189 called from memmove, or with code tested on x86 whose
190 memcpy always works with forward copies. */
191 tmp0 = *src8++;
192 tmp1 = *src8++;
193 tmp2 = *src8++;
194 tmp3 = *src8++;
195 tmp4 = *src8++;
196 tmp5 = *src8++;
197 tmp6 = *src8++;
198 tmp7 = *src8++;
200 __insn_wh64 (dst8);
202 *dst8++ = tmp0;
203 *dst8++ = tmp1;
204 *dst8++ = tmp2;
205 *dst8++ = tmp3;
206 *dst8++ = tmp4;
207 *dst8++ = tmp5;
208 *dst8++ = tmp6;
209 *dst8++ = tmp7;
211 n -= 64;
213 #if CHIP_L2_LINE_SIZE() != 64
214 # error "Fix code that assumes particular L2 cache line size."
215 #endif
218 for (; n >= sizeof (op_t); n -= sizeof (op_t))
219 *dst8++ = *src8++;
221 if (__builtin_expect (n == 0, 1))
222 return dstv;
224 final = *src8;
227 /* n != 0 if we get here. Write out any trailing bytes. */
228 dst1 = (char *) dst8;
229 #ifndef __BIG_ENDIAN__
230 if (n & 4)
232 *(uint32_t *) dst1 = final;
233 dst1 += 4;
234 final >>= 32;
235 n &= 3;
237 if (n & 2)
239 *(uint16_t *) dst1 = final;
240 dst1 += 2;
241 final >>= 16;
242 n &= 1;
244 if (n)
245 *(uint8_t *) dst1 = final;
246 #else
247 if (n & 4)
249 *(uint32_t *) dst1 = final >> 32;
250 dst1 += 4;
252 else
254 final >>= 32;
256 if (n & 2)
258 *(uint16_t *) dst1 = final >> 16;
259 dst1 += 2;
261 else
263 final >>= 16;
265 if (n & 1)
266 *(uint8_t *) dst1 = final >> 8;
267 #endif
269 return dstv;
271 weak_alias (__memcpy, memcpy)
272 libc_hidden_builtin_def (memcpy)