/* Repository viewer header retained from extraction:
   "Fix pread consolidation on ports that require argument alignment"
   [glibc.git] / sysdeps / tile / wordcopy.c
   blob 64263883f70951372d8df39613c7633ab6240e05  */
/* wordcopy.c -- subroutines for memory copy functions.  Tile version.
   Copyright (C) 1991-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* To optimize for tile, we make the following changes from the
   default glibc version:
   - Use the double align instruction instead of the MERGE macro.
   - Since we don't have offset addressing mode, make sure the loads /
     stores in the inner loop always have indices of 0.
   - Use post-increment addresses in the inner loops, which yields
     better scheduling.  */

/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...!  */
29 #include <stddef.h>
30 #include <memcopy.h>
/* Provide the appropriate dblalign builtin to shift two registers
   based on the alignment of a pointer held in a third register.
   The builtin is chosen at compile time: __insn_dblalign when
   __tilegx__ is defined, __insn_dword_align otherwise.  The copy
   routines below always go through DBLALIGN so they stay identical
   for both variants.  */
#ifdef __tilegx__
#define DBLALIGN __insn_dblalign
#else
#define DBLALIGN __insn_dword_align
#endif
40 /* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
41 block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
42 Both SRCP and DSTP should be aligned for memory operations on `op_t's. */
44 void
45 _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
47 op_t a0, a1;
49 switch (len % 8)
51 case 2:
52 a0 = ((op_t *) srcp)[0];
53 srcp += OPSIZ;
54 len += 6;
55 goto do1;
56 case 3:
57 a1 = ((op_t *) srcp)[0];
58 srcp += OPSIZ;
59 len += 5;
60 goto do2;
61 case 4:
62 a0 = ((op_t *) srcp)[0];
63 srcp += OPSIZ;
64 len += 4;
65 goto do3;
66 case 5:
67 a1 = ((op_t *) srcp)[0];
68 srcp += OPSIZ;
69 len += 3;
70 goto do4;
71 case 6:
72 a0 = ((op_t *) srcp)[0];
73 srcp += OPSIZ;
74 len += 2;
75 goto do5;
76 case 7:
77 a1 = ((op_t *) srcp)[0];
78 srcp += OPSIZ;
79 len += 1;
80 goto do6;
82 case 0:
83 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
84 return;
85 a0 = ((op_t *) srcp)[0];
86 srcp += OPSIZ;
87 goto do7;
88 case 1:
89 a1 = ((op_t *) srcp)[0];
90 srcp += OPSIZ;
91 len -= 1;
92 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
93 goto do0;
94 goto do8; /* No-op. */
99 do8:
100 a0 = ((op_t *) srcp)[0];
101 ((op_t *) dstp)[0] = a1;
102 srcp += OPSIZ;
103 dstp += OPSIZ;
104 do7:
105 a1 = ((op_t *) srcp)[0];
106 ((op_t *) dstp)[0] = a0;
107 srcp += OPSIZ;
108 dstp += OPSIZ;
109 do6:
110 a0 = ((op_t *) srcp)[0];
111 ((op_t *) dstp)[0] = a1;
112 srcp += OPSIZ;
113 dstp += OPSIZ;
114 do5:
115 a1 = ((op_t *) srcp)[0];
116 ((op_t *) dstp)[0] = a0;
117 srcp += OPSIZ;
118 dstp += OPSIZ;
119 do4:
120 a0 = ((op_t *) srcp)[0];
121 ((op_t *) dstp)[0] = a1;
122 srcp += OPSIZ;
123 dstp += OPSIZ;
124 do3:
125 a1 = ((op_t *) srcp)[0];
126 ((op_t *) dstp)[0] = a0;
127 srcp += OPSIZ;
128 dstp += OPSIZ;
129 do2:
130 a0 = ((op_t *) srcp)[0];
131 ((op_t *) dstp)[0] = a1;
132 srcp += OPSIZ;
133 dstp += OPSIZ;
134 do1:
135 a1 = ((op_t *) srcp)[0];
136 ((op_t *) dstp)[0] = a0;
137 srcp += OPSIZ;
138 dstp += OPSIZ;
140 len -= 8;
142 while (len != 0);
144 /* This is the right position for do0. Please don't move
145 it into the loop. */
146 do0:
147 ((op_t *) dstp)[0] = a1;
150 /* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
151 block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
152 DSTP should be aligned for memory operations on `op_t's, but SRCP must
153 *not* be aligned. */
155 void
156 _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
158 void * srci;
159 op_t a0, a1, a2, a3;
161 /* Save the initial source pointer so we know the number of bytes to
162 shift for merging two unaligned results. */
163 srci = (void *) srcp;
165 /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
166 it points in the middle of. */
167 srcp &= -OPSIZ;
169 switch (len % 4)
171 case 2:
172 a1 = ((op_t *) srcp)[0];
173 a2 = ((op_t *) srcp)[1];
174 len += 2;
175 srcp += 2 * OPSIZ;
176 goto do1;
177 case 3:
178 a0 = ((op_t *) srcp)[0];
179 a1 = ((op_t *) srcp)[1];
180 len += 1;
181 srcp += 2 * OPSIZ;
182 goto do2;
183 case 0:
184 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
185 return;
186 a3 = ((op_t *) srcp)[0];
187 a0 = ((op_t *) srcp)[1];
188 len += 0;
189 srcp += 2 * OPSIZ;
190 goto do3;
191 case 1:
192 a2 = ((op_t *) srcp)[0];
193 a3 = ((op_t *) srcp)[1];
194 srcp += 2 * OPSIZ;
195 len -= 1;
196 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
197 goto do0;
198 goto do4; /* No-op. */
203 do4:
204 a0 = ((op_t *) srcp)[0];
205 a2 = DBLALIGN (a2, a3, srci);
206 ((op_t *) dstp)[0] = a2;
207 srcp += OPSIZ;
208 dstp += OPSIZ;
209 do3:
210 a1 = ((op_t *) srcp)[0];
211 a3 = DBLALIGN (a3, a0, srci);
212 ((op_t *) dstp)[0] = a3;
213 srcp += OPSIZ;
214 dstp += OPSIZ;
215 do2:
216 a2 = ((op_t *) srcp)[0];
217 a0 = DBLALIGN (a0, a1, srci);
218 ((op_t *) dstp)[0] = a0;
219 srcp += OPSIZ;
220 dstp += OPSIZ;
221 do1:
222 a3 = ((op_t *) srcp)[0];
223 a1 = DBLALIGN (a1, a2, srci);
224 ((op_t *) dstp)[0] = a1;
225 srcp += OPSIZ;
226 dstp += OPSIZ;
227 len -= 4;
229 while (len != 0);
231 /* This is the right position for do0. Please don't move
232 it into the loop. */
233 do0:
234 ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
237 /* _wordcopy_bwd_aligned -- Copy block finishing right before
238 SRCP to block finishing right before DSTP with LEN `op_t' words
239 (not LEN bytes!). Both SRCP and DSTP should be aligned for memory
240 operations on `op_t's. */
242 void
243 _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
245 op_t a0, a1;
246 long int srcp1;
248 srcp1 = srcp - 1 * OPSIZ;
249 srcp -= 2 * OPSIZ;
250 dstp -= 1 * OPSIZ;
252 switch (len % 8)
254 case 2:
255 a0 = ((op_t *) srcp1)[0];
256 len += 6;
257 goto do1;
258 case 3:
259 a1 = ((op_t *) srcp1)[0];
260 len += 5;
261 goto do2;
262 case 4:
263 a0 = ((op_t *) srcp1)[0];
264 len += 4;
265 goto do3;
266 case 5:
267 a1 = ((op_t *) srcp1)[0];
268 len += 3;
269 goto do4;
270 case 6:
271 a0 = ((op_t *) srcp1)[0];
272 len += 2;
273 goto do5;
274 case 7:
275 a1 = ((op_t *) srcp1)[0];
276 len += 1;
277 goto do6;
279 case 0:
280 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
281 return;
282 a0 = ((op_t *) srcp1)[0];
283 goto do7;
284 case 1:
285 a1 = ((op_t *) srcp1)[0];
286 len -= 1;
287 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
288 goto do0;
289 goto do8; /* No-op. */
294 do8:
295 a0 = ((op_t *) srcp)[0];
296 ((op_t *) dstp)[0] = a1;
297 srcp -= OPSIZ;
298 dstp -= OPSIZ;
299 do7:
300 a1 = ((op_t *) srcp)[0];
301 ((op_t *) dstp)[0] = a0;
302 srcp -= OPSIZ;
303 dstp -= OPSIZ;
304 do6:
305 a0 = ((op_t *) srcp)[0];
306 ((op_t *) dstp)[0] = a1;
307 srcp -= OPSIZ;
308 dstp -= OPSIZ;
309 do5:
310 a1 = ((op_t *) srcp)[0];
311 ((op_t *) dstp)[0] = a0;
312 srcp -= OPSIZ;
313 dstp -= OPSIZ;
314 do4:
315 a0 = ((op_t *) srcp)[0];
316 ((op_t *) dstp)[0] = a1;
317 srcp -= OPSIZ;
318 dstp -= OPSIZ;
319 do3:
320 a1 = ((op_t *) srcp)[0];
321 ((op_t *) dstp)[0] = a0;
322 srcp -= OPSIZ;
323 dstp -= OPSIZ;
324 do2:
325 a0 = ((op_t *) srcp)[0];
326 ((op_t *) dstp)[0] = a1;
327 srcp -= OPSIZ;
328 dstp -= OPSIZ;
329 do1:
330 a1 = ((op_t *) srcp)[0];
331 ((op_t *) dstp)[0] = a0;
332 srcp -= OPSIZ;
333 dstp -= OPSIZ;
335 len -= 8;
337 while (len != 0);
339 /* This is the right position for do0. Please don't move
340 it into the loop. */
341 do0:
342 ((op_t *) dstp)[0] = a1;
345 /* _wordcopy_bwd_dest_aligned -- Copy block finishing right
346 before SRCP to block finishing right before DSTP with LEN `op_t'
347 words (not LEN bytes!). DSTP should be aligned for memory
348 operations on `op_t', but SRCP must *not* be aligned. */
350 void
351 _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
353 void * srci;
354 op_t a0, a1, a2, a3;
355 op_t b0, b1, b2, b3;
357 /* Save the initial source pointer so we know the number of bytes to
358 shift for merging two unaligned results. */
359 srci = (void *) srcp;
361 /* Make SRCP aligned by rounding it down to the beginning of the op_t
362 it points in the middle of. */
363 srcp &= -OPSIZ;
364 srcp += OPSIZ;
366 switch (len % 4)
368 case 2:
369 srcp -= 3 * OPSIZ;
370 dstp -= 1 * OPSIZ;
371 b2 = ((op_t *) srcp)[2];
372 b1 = a1 = ((op_t *) srcp)[1];
373 len += 2;
374 goto do1;
375 case 3:
376 srcp -= 3 * OPSIZ;
377 dstp -= 1 * OPSIZ;
378 b3 = ((op_t *) srcp)[2];
379 b2 = a2 = ((op_t *) srcp)[1];
380 len += 1;
381 goto do2;
382 case 0:
383 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
384 return;
385 srcp -= 3 * OPSIZ;
386 dstp -= 1 * OPSIZ;
387 b0 = ((op_t *) srcp)[2];
388 b3 = a3 = ((op_t *) srcp)[1];
389 goto do3;
390 case 1:
391 srcp -= 3 * OPSIZ;
392 dstp -= 1 * OPSIZ;
393 b1 = ((op_t *) srcp)[2];
394 b0 = a0 = ((op_t *) srcp)[1];
395 len -= 1;
396 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
397 goto do0;
398 goto do4; /* No-op. */
403 do4:
404 b3 = a3 = ((op_t *) srcp)[0];
405 a0 = DBLALIGN (a0, b1, srci);
406 ((op_t *) dstp)[0] = a0;
407 srcp -= OPSIZ;
408 dstp -= OPSIZ;
409 do3:
410 b2 = a2 = ((op_t *) srcp)[0];
411 a3 = DBLALIGN (a3, b0, srci);
412 ((op_t *) dstp)[0] = a3;
413 srcp -= OPSIZ;
414 dstp -= OPSIZ;
415 do2:
416 b1 = a1 = ((op_t *) srcp)[0];
417 a2 = DBLALIGN (a2, b3, srci);
418 ((op_t *) dstp)[0] = a2;
419 srcp -= OPSIZ;
420 dstp -= OPSIZ;
421 do1:
422 b0 = a0 = ((op_t *) srcp)[0];
423 a1 = DBLALIGN (a1, b2, srci);
424 ((op_t *) dstp)[0] = a1;
425 srcp -= OPSIZ;
426 dstp -= OPSIZ;
428 len -= 4;
430 while (len != 0);
432 /* This is the right position for do0. Please don't move
433 it into the loop. */
434 do0:
435 a0 = DBLALIGN (a0, b1, srci);
436 ((op_t *) dstp)[0] = a0;