Define XTABS to TAB3 on alpha to match Linux 4.16.
[glibc.git] / sysdeps / tile / wordcopy.c
blobba73a0be0731a02ba939393c8d57dbe8fdf256e2
1 /* wordcopy.c -- subroutines for memory copy functions. Tile version.
2 Copyright (C) 1991-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
/* To optimize for tile, we make the following changes from the
   default glibc version:
   - Use the double align instruction instead of the MERGE macro.
   - Since we don't have offset addressing mode, make sure the loads /
     stores in the inner loop always have indices of 0.
   - Use post-increment addresses in the inner loops, which yields
     better scheduling.  */
27 /* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */
29 #include <stddef.h>
30 #include <memcopy.h>
/* Provide the appropriate dblalign builtin to shift two registers
   based on the alignment of a pointer held in a third register.
   Used as DBLALIGN (first_word, second_word, unaligned_pointer) by the
   *_dest_aligned routines below to merge two adjacent aligned source
   words into one destination word.  */
#define DBLALIGN __insn_dblalign
36 /* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
37 block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
38 Both SRCP and DSTP should be aligned for memory operations on `op_t's. */
40 void
41 _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
43 op_t a0, a1;
45 switch (len % 8)
47 case 2:
48 a0 = ((op_t *) srcp)[0];
49 srcp += OPSIZ;
50 len += 6;
51 goto do1;
52 case 3:
53 a1 = ((op_t *) srcp)[0];
54 srcp += OPSIZ;
55 len += 5;
56 goto do2;
57 case 4:
58 a0 = ((op_t *) srcp)[0];
59 srcp += OPSIZ;
60 len += 4;
61 goto do3;
62 case 5:
63 a1 = ((op_t *) srcp)[0];
64 srcp += OPSIZ;
65 len += 3;
66 goto do4;
67 case 6:
68 a0 = ((op_t *) srcp)[0];
69 srcp += OPSIZ;
70 len += 2;
71 goto do5;
72 case 7:
73 a1 = ((op_t *) srcp)[0];
74 srcp += OPSIZ;
75 len += 1;
76 goto do6;
78 case 0:
79 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
80 return;
81 a0 = ((op_t *) srcp)[0];
82 srcp += OPSIZ;
83 goto do7;
84 case 1:
85 a1 = ((op_t *) srcp)[0];
86 srcp += OPSIZ;
87 len -= 1;
88 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
89 goto do0;
90 goto do8; /* No-op. */
95 do8:
96 a0 = ((op_t *) srcp)[0];
97 ((op_t *) dstp)[0] = a1;
98 srcp += OPSIZ;
99 dstp += OPSIZ;
100 do7:
101 a1 = ((op_t *) srcp)[0];
102 ((op_t *) dstp)[0] = a0;
103 srcp += OPSIZ;
104 dstp += OPSIZ;
105 do6:
106 a0 = ((op_t *) srcp)[0];
107 ((op_t *) dstp)[0] = a1;
108 srcp += OPSIZ;
109 dstp += OPSIZ;
110 do5:
111 a1 = ((op_t *) srcp)[0];
112 ((op_t *) dstp)[0] = a0;
113 srcp += OPSIZ;
114 dstp += OPSIZ;
115 do4:
116 a0 = ((op_t *) srcp)[0];
117 ((op_t *) dstp)[0] = a1;
118 srcp += OPSIZ;
119 dstp += OPSIZ;
120 do3:
121 a1 = ((op_t *) srcp)[0];
122 ((op_t *) dstp)[0] = a0;
123 srcp += OPSIZ;
124 dstp += OPSIZ;
125 do2:
126 a0 = ((op_t *) srcp)[0];
127 ((op_t *) dstp)[0] = a1;
128 srcp += OPSIZ;
129 dstp += OPSIZ;
130 do1:
131 a1 = ((op_t *) srcp)[0];
132 ((op_t *) dstp)[0] = a0;
133 srcp += OPSIZ;
134 dstp += OPSIZ;
136 len -= 8;
138 while (len != 0);
140 /* This is the right position for do0. Please don't move
141 it into the loop. */
142 do0:
143 ((op_t *) dstp)[0] = a1;
146 /* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
147 block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
148 DSTP should be aligned for memory operations on `op_t's, but SRCP must
149 *not* be aligned. */
151 void
152 _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
154 void * srci;
155 op_t a0, a1, a2, a3;
157 /* Save the initial source pointer so we know the number of bytes to
158 shift for merging two unaligned results. */
159 srci = (void *) srcp;
161 /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
162 it points in the middle of. */
163 srcp &= -OPSIZ;
165 switch (len % 4)
167 case 2:
168 a1 = ((op_t *) srcp)[0];
169 a2 = ((op_t *) srcp)[1];
170 len += 2;
171 srcp += 2 * OPSIZ;
172 goto do1;
173 case 3:
174 a0 = ((op_t *) srcp)[0];
175 a1 = ((op_t *) srcp)[1];
176 len += 1;
177 srcp += 2 * OPSIZ;
178 goto do2;
179 case 0:
180 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
181 return;
182 a3 = ((op_t *) srcp)[0];
183 a0 = ((op_t *) srcp)[1];
184 len += 0;
185 srcp += 2 * OPSIZ;
186 goto do3;
187 case 1:
188 a2 = ((op_t *) srcp)[0];
189 a3 = ((op_t *) srcp)[1];
190 srcp += 2 * OPSIZ;
191 len -= 1;
192 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
193 goto do0;
194 goto do4; /* No-op. */
199 do4:
200 a0 = ((op_t *) srcp)[0];
201 a2 = DBLALIGN (a2, a3, srci);
202 ((op_t *) dstp)[0] = a2;
203 srcp += OPSIZ;
204 dstp += OPSIZ;
205 do3:
206 a1 = ((op_t *) srcp)[0];
207 a3 = DBLALIGN (a3, a0, srci);
208 ((op_t *) dstp)[0] = a3;
209 srcp += OPSIZ;
210 dstp += OPSIZ;
211 do2:
212 a2 = ((op_t *) srcp)[0];
213 a0 = DBLALIGN (a0, a1, srci);
214 ((op_t *) dstp)[0] = a0;
215 srcp += OPSIZ;
216 dstp += OPSIZ;
217 do1:
218 a3 = ((op_t *) srcp)[0];
219 a1 = DBLALIGN (a1, a2, srci);
220 ((op_t *) dstp)[0] = a1;
221 srcp += OPSIZ;
222 dstp += OPSIZ;
223 len -= 4;
225 while (len != 0);
227 /* This is the right position for do0. Please don't move
228 it into the loop. */
229 do0:
230 ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
233 /* _wordcopy_bwd_aligned -- Copy block finishing right before
234 SRCP to block finishing right before DSTP with LEN `op_t' words
235 (not LEN bytes!). Both SRCP and DSTP should be aligned for memory
236 operations on `op_t's. */
238 void
239 _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
241 op_t a0, a1;
242 long int srcp1;
244 srcp1 = srcp - 1 * OPSIZ;
245 srcp -= 2 * OPSIZ;
246 dstp -= 1 * OPSIZ;
248 switch (len % 8)
250 case 2:
251 a0 = ((op_t *) srcp1)[0];
252 len += 6;
253 goto do1;
254 case 3:
255 a1 = ((op_t *) srcp1)[0];
256 len += 5;
257 goto do2;
258 case 4:
259 a0 = ((op_t *) srcp1)[0];
260 len += 4;
261 goto do3;
262 case 5:
263 a1 = ((op_t *) srcp1)[0];
264 len += 3;
265 goto do4;
266 case 6:
267 a0 = ((op_t *) srcp1)[0];
268 len += 2;
269 goto do5;
270 case 7:
271 a1 = ((op_t *) srcp1)[0];
272 len += 1;
273 goto do6;
275 case 0:
276 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
277 return;
278 a0 = ((op_t *) srcp1)[0];
279 goto do7;
280 case 1:
281 a1 = ((op_t *) srcp1)[0];
282 len -= 1;
283 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
284 goto do0;
285 goto do8; /* No-op. */
290 do8:
291 a0 = ((op_t *) srcp)[0];
292 ((op_t *) dstp)[0] = a1;
293 srcp -= OPSIZ;
294 dstp -= OPSIZ;
295 do7:
296 a1 = ((op_t *) srcp)[0];
297 ((op_t *) dstp)[0] = a0;
298 srcp -= OPSIZ;
299 dstp -= OPSIZ;
300 do6:
301 a0 = ((op_t *) srcp)[0];
302 ((op_t *) dstp)[0] = a1;
303 srcp -= OPSIZ;
304 dstp -= OPSIZ;
305 do5:
306 a1 = ((op_t *) srcp)[0];
307 ((op_t *) dstp)[0] = a0;
308 srcp -= OPSIZ;
309 dstp -= OPSIZ;
310 do4:
311 a0 = ((op_t *) srcp)[0];
312 ((op_t *) dstp)[0] = a1;
313 srcp -= OPSIZ;
314 dstp -= OPSIZ;
315 do3:
316 a1 = ((op_t *) srcp)[0];
317 ((op_t *) dstp)[0] = a0;
318 srcp -= OPSIZ;
319 dstp -= OPSIZ;
320 do2:
321 a0 = ((op_t *) srcp)[0];
322 ((op_t *) dstp)[0] = a1;
323 srcp -= OPSIZ;
324 dstp -= OPSIZ;
325 do1:
326 a1 = ((op_t *) srcp)[0];
327 ((op_t *) dstp)[0] = a0;
328 srcp -= OPSIZ;
329 dstp -= OPSIZ;
331 len -= 8;
333 while (len != 0);
335 /* This is the right position for do0. Please don't move
336 it into the loop. */
337 do0:
338 ((op_t *) dstp)[0] = a1;
341 /* _wordcopy_bwd_dest_aligned -- Copy block finishing right
342 before SRCP to block finishing right before DSTP with LEN `op_t'
343 words (not LEN bytes!). DSTP should be aligned for memory
344 operations on `op_t', but SRCP must *not* be aligned. */
346 void
347 _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
349 void * srci;
350 op_t a0, a1, a2, a3;
351 op_t b0, b1, b2, b3;
353 /* Save the initial source pointer so we know the number of bytes to
354 shift for merging two unaligned results. */
355 srci = (void *) srcp;
357 /* Make SRCP aligned by rounding it down to the beginning of the op_t
358 it points in the middle of. */
359 srcp &= -OPSIZ;
360 srcp += OPSIZ;
362 switch (len % 4)
364 case 2:
365 srcp -= 3 * OPSIZ;
366 dstp -= 1 * OPSIZ;
367 b2 = ((op_t *) srcp)[2];
368 b1 = a1 = ((op_t *) srcp)[1];
369 len += 2;
370 goto do1;
371 case 3:
372 srcp -= 3 * OPSIZ;
373 dstp -= 1 * OPSIZ;
374 b3 = ((op_t *) srcp)[2];
375 b2 = a2 = ((op_t *) srcp)[1];
376 len += 1;
377 goto do2;
378 case 0:
379 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
380 return;
381 srcp -= 3 * OPSIZ;
382 dstp -= 1 * OPSIZ;
383 b0 = ((op_t *) srcp)[2];
384 b3 = a3 = ((op_t *) srcp)[1];
385 goto do3;
386 case 1:
387 srcp -= 3 * OPSIZ;
388 dstp -= 1 * OPSIZ;
389 b1 = ((op_t *) srcp)[2];
390 b0 = a0 = ((op_t *) srcp)[1];
391 len -= 1;
392 if (OP_T_THRES <= 3 * OPSIZ && len == 0)
393 goto do0;
394 goto do4; /* No-op. */
399 do4:
400 b3 = a3 = ((op_t *) srcp)[0];
401 a0 = DBLALIGN (a0, b1, srci);
402 ((op_t *) dstp)[0] = a0;
403 srcp -= OPSIZ;
404 dstp -= OPSIZ;
405 do3:
406 b2 = a2 = ((op_t *) srcp)[0];
407 a3 = DBLALIGN (a3, b0, srci);
408 ((op_t *) dstp)[0] = a3;
409 srcp -= OPSIZ;
410 dstp -= OPSIZ;
411 do2:
412 b1 = a1 = ((op_t *) srcp)[0];
413 a2 = DBLALIGN (a2, b3, srci);
414 ((op_t *) dstp)[0] = a2;
415 srcp -= OPSIZ;
416 dstp -= OPSIZ;
417 do1:
418 b0 = a0 = ((op_t *) srcp)[0];
419 a1 = DBLALIGN (a1, b2, srci);
420 ((op_t *) dstp)[0] = a1;
421 srcp -= OPSIZ;
422 dstp -= OPSIZ;
424 len -= 4;
426 while (len != 0);
428 /* This is the right position for do0. Please don't move
429 it into the loop. */
430 do0:
431 a0 = DBLALIGN (a0, b1, srci);
432 ((op_t *) dstp)[0] = a0;