/* memcpy.S: Sparc optimized memcpy, bcopy and memmove code
 * Hand optimized from GNU libc's memcpy, bcopy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */
#include <asm/cprefix.h>

        .type C_LABEL(x),@function; \

#undef FASTER_NONALIGNED
#define FASTER_ALIGNED
/* In the kernel these functions don't return a value;
 * use the macros in asm/string.h for that purpose.
 * We return 0, so that bugs are more apparent.
 */
#define RETL_INSN       clr %o0
#define FASTER_REVERSE
#define FASTER_NONALIGNED
#define FASTER_ALIGNED

#define SETUP_RETL      mov %o0, %g6
#define RETL_INSN       mov %g6, %o0
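/* SETUP_RETL stashes the original destination pointer in %g6 on entry so
 * that RETL_INSN can hand it back as the return value, the usual memcpy
 * convention.  The kernel build instead clears %o0, so callers that wrongly
 * rely on the return value fail loudly.
 */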
/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + offset + 0x00], %t0; \
        ldd [%src + offset + 0x08], %t2; \
        ldd [%src + offset + 0x10], %t4; \
        ldd [%src + offset + 0x18], %t6; \
        st %t0, [%dst + offset + 0x00]; \
        st %t1, [%dst + offset + 0x04]; \
        st %t2, [%dst + offset + 0x08]; \
        st %t3, [%dst + offset + 0x0c]; \
        st %t4, [%dst + offset + 0x10]; \
        st %t5, [%dst + offset + 0x14]; \
        st %t6, [%dst + offset + 0x18]; \
        st %t7, [%dst + offset + 0x1c];
#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + offset + 0x00], %t0; \
        ldd [%src + offset + 0x08], %t2; \
        ldd [%src + offset + 0x10], %t4; \
        ldd [%src + offset + 0x18], %t6; \
        std %t0, [%dst + offset + 0x00]; \
        std %t2, [%dst + offset + 0x08]; \
        std %t4, [%dst + offset + 0x10]; \
        std %t6, [%dst + offset + 0x18];
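/* Each of the two macros above moves a 0x20-byte block ascending through
 * memory: four ldd loads fill the t0-t7 register pairs, then MOVE_BIGCHUNK
 * writes them back with eight 4-byte st (destination only word aligned),
 * while MOVE_BIGALIGNCHUNK uses four 8-byte std for a doubleword-aligned
 * destination.
 */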
#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - offset - 0x10], %t0; \
        ldd [%src - offset - 0x08], %t2; \
        st %t0, [%dst - offset - 0x10]; \
        st %t1, [%dst - offset - 0x0c]; \
        st %t2, [%dst - offset - 0x08]; \
        st %t3, [%dst - offset - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - offset - 0x10], %t0; \
        ldd [%src - offset - 0x08], %t2; \
        std %t0, [%dst - offset - 0x10]; \
        std %t2, [%dst - offset - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src - offset - 0x02], %t0; \
        ldub [%src - offset - 0x01], %t1; \
        stb %t0, [%dst - offset - 0x02]; \
        stb %t1, [%dst - offset - 0x01];
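/* MOVE_LASTCHUNK and MOVE_LASTALIGNCHUNK mop up 0x10 bytes at a time and
 * MOVE_SHORTCHUNK two bytes at a time, all addressed with negative offsets
 * from the current pointers.  They are expanded into the unrolled tail
 * tables below, which the copy loops enter part-way through with a computed
 * jump so that only as many chunks execute as bytes remain.
 */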
/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - offset - 0x20], %t0; \
        ldd [%src - offset - 0x18], %t2; \
        ldd [%src - offset - 0x10], %t4; \
        ldd [%src - offset - 0x08], %t6; \
        st %t0, [%dst - offset - 0x20]; \
        st %t1, [%dst - offset - 0x1c]; \
        st %t2, [%dst - offset - 0x18]; \
        st %t3, [%dst - offset - 0x14]; \
        st %t4, [%dst - offset - 0x10]; \
        st %t5, [%dst - offset - 0x0c]; \
        st %t6, [%dst - offset - 0x08]; \
        st %t7, [%dst - offset - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - offset - 0x20], %t0; \
        ldd [%src - offset - 0x18], %t2; \
        ldd [%src - offset - 0x10], %t4; \
        ldd [%src - offset - 0x08], %t6; \
        std %t0, [%dst - offset - 0x20]; \
        std %t2, [%dst - offset - 0x18]; \
        std %t4, [%dst - offset - 0x10]; \
        std %t6, [%dst - offset - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src + offset + 0x00], %t0; \
        ldd [%src + offset + 0x08], %t2; \
        st %t0, [%dst + offset + 0x00]; \
        st %t1, [%dst + offset + 0x04]; \
        st %t2, [%dst + offset + 0x08]; \
        st %t3, [%dst + offset + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src + offset + 0x00], %t0; \
        ldub [%src + offset + 0x01], %t1; \
        stb %t0, [%dst + offset + 0x00]; \
        stb %t1, [%dst + offset + 0x01];
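/* The RMOVE_* macros are the descending counterparts of the MOVE_* ones:
 * they step from high addresses toward low ones.  The reverse path of
 * memmove uses them when the destination overlaps the tail of the source,
 * so the copy has to run backwards to avoid clobbering bytes that have not
 * been read yet.
 */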
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + offset + 0x00], %t0; \
        ldd [%src + offset + 0x08], %t2; \
        srl %t0, shir, %t5; \
        srl %t1, shir, %t6; \
        sll %t0, shil, %t0; \
        or %t5, %prev, %t5; \
        sll %t1, shil, %prev; \
        srl %t2, shir, %t1; \
        srl %t3, shir, %t6; \
        sll %t2, shil, %t2; \
        or %t1, %prev, %t1; \
        std %t4, [%dst + offset + offset2 - 0x04]; \
        std %t0, [%dst + offset + offset2 + 0x04]; \
        sll %t3, shil, %prev; \

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + offset + 0x00], %t0; \
        ldd [%src + offset + 0x08], %t2; \
        srl %t0, shir, %t4; \
        srl %t1, shir, %t5; \
        sll %t0, shil, %t6; \
        or %t4, %prev, %t0; \
        sll %t1, shil, %prev; \
        srl %t2, shir, %t4; \
        srl %t3, shir, %t5; \
        sll %t2, shil, %t6; \
        or %t4, %prev, %t2; \
        sll %t3, shil, %prev; \
        std %t0, [%dst + offset + offset2 + 0x00]; \
        std %t2, [%dst + offset + offset2 + 0x08];
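/* SMOVE_CHUNK and SMOVE_ALIGNCHUNK service copies whose source and
 * destination disagree on sub-word alignment.  Aligned doublewords are
 * loaded from the source and the destination words are reassembled with
 * sll/srl/or pairs; shil and shir always sum to 32, and the 8/24, 16/16
 * and 24/8 instantiations below select how many bytes the data is shifted
 * by, with offset2 biasing the std address to match.
 *
 * A rough C sketch of the merge step (illustration only; the helper name
 * and signature are not part of this file, and the real code keeps the
 * carried bits in a register rather than memory):
 *
 *     #include <stdint.h>
 *
 *     // Produce one destination word when the source is misaligned:
 *     // finish the previous word with the high bits of 'cur', then save
 *     // the leftover low bits of 'cur' for the next call.
 *     static uint32_t merge_word(uint32_t cur, uint32_t *prev,
 *                                unsigned shil, unsigned shir)
 *     {
 *             uint32_t out = *prev | (cur >> shir);
 *             *prev = cur << shil;
 *             return out;
 *     }
 */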
#ifdef FASTER_REVERSE

70:     /* rdword_align */

#endif /* FASTER_REVERSE */
        nop                     ! Only bcopy returns here and it returns void...
        /* Do the cmp in the delay slot */
#ifndef FASTER_REVERSE
1:      /* reverse_bytes */
#else /* FASTER_REVERSE */
        andcc %g1, 0xffffff80, %g7
        RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
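/* The andcc with 0xffffff80 above isolates the whole 128-byte chunks of the
 * remaining length; each pass through the four RMOVE_BIGCHUNK expansions
 * moves one such chunk.  What is left after the loop is handled by jumping
 * into the middle of the rmemcpy_table below, the entry point being
 * computed from the residual byte count.
 */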
        jmpl %o5 + %lo(72f), %g0
71:     /* rmemcpy_table */
        RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
72:     /* rmemcpy_table_end */
        ldd [%o1 - 0x08], %g2
73:     /* rmemcpy_last7 */
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
        jmpl %o5 + %lo(72b), %g0
        jmpl %o5 + %lo(76f), %g0
        RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
76:     /* rshort_table_end */
91:     /* rshort_aligned_end */
77:     /* rnon_aligned */
#endif /* FASTER_REVERSE */
/* NOTE: This code is executed only in the cases where
         %src (= %o1) & 3 is != 0, and we need to align it to 4.
         So, for (%src & 3) ==
           1  we need to do ldub, lduh
           2  lduh
           3  just ldub
         so even if it looks weird, the branches are correct here. -jj
 */
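/* The same prologue expressed as a rough C sketch, assuming the word loop
 * that follows requires a 4-byte aligned source; the helper name and
 * signature are illustrative only and do not appear in this file:
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     // Copy the 1-3 leading bytes needed to bring src to a word boundary.
 *     static void align_src_to_4(const unsigned char **src,
 *                                unsigned char **dst, size_t *len)
 *     {
 *             switch ((uintptr_t)*src & 3) {
 *             case 1:                         // 3 bytes short: ldub + lduh
 *                     *(*dst)++ = *(*src)++;
 *                     (*len)--;
 *                     // fall through
 *             case 2:                         // 2 bytes short: lduh
 *                     *(*dst)++ = *(*src)++;
 *                     *(*dst)++ = *(*src)++;
 *                     *len -= 2;
 *                     break;
 *             case 3:                         // 1 byte short: ldub
 *                     *(*dst)++ = *(*src)++;
 *                     (*len)--;
 *                     break;
 *             }
 *     }
 */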
78:     /* dword_align */

FUNC(memcpy)    /* %o0=dst %o1=src %o2=len */
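/* Forward copy entry point.  The body below mirrors the reverse path:
 * misaligned sources are first brought to a word boundary (the dword_align
 * code above), 128-byte blocks are then moved through the unrolled
 * MOVE_BIGCHUNK/MOVE_BIGALIGNCHUNK loops, and the tail is finished by
 * computed jumps into the memcpy_table/amemcpy_table expansions and the
 * short-copy code further down.
 */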
        andcc %g1, 0xffffff80, %g7
        MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
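/* As in the reverse path, the 0xffffff80 mask selects whole 128-byte chunks
 * and each iteration of the four MOVE_BIGCHUNK expansions copies one of
 * them; the leftover length is dispatched below with a computed jmpl into
 * the memcpy_table.
 */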
        jmpl %o5 + %lo(80f), %g0
79:     /* memcpy_table */
        MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
80:     /* memcpy_table_end */
81:     /* memcpy_last7 */
        MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
#ifndef FASTER_ALIGNED
        jmpl %o5 + %lo(80b), %g0
#else /* FASTER_ALIGNED */
        jmpl %o5 + %lo(84f), %g0
83:     /* amemcpy_table */
        MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)
84:     /* amemcpy_table_end */
        std %g2, [%o0 - 0x08]
85:     /* amemcpy_last7 */
#endif /* FASTER_ALIGNED */
86:     /* non_aligned */
#ifdef FASTER_NONALIGNED
#endif /* FASTER_NONALIGNED */
#ifdef FASTER_NONALIGNED
87:     /* faster_nonaligned */
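/* The faster_nonaligned path keys off (%src & 3) and picks one of three
 * shift configurations for the SMOVE_* macros below (shil/shir of 8/24,
 * 16/16 or 24/8, one per possible byte offset within the word).  Each block
 * first peels off whole 64-byte runs (the 0xffffffc0 mask) and then the
 * remaining 16-byte groups (the 0x30 mask).
 */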
        and %o2, 0xffffffc0, %o3
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        and %o2, 0xffffffc0, %o3
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        and %o2, 0xffffffc0, %o3
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        andcc %o2, 0x30, %o3
        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        andcc %o2, 0x30, %o3
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        andcc %o2, 0x30, %o3
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        andcc %o2, 0x30, %o3
        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
#endif /* FASTER_NONALIGNED */
        jmpl %o5 + %lo(89f), %g0
        MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)
89:     /* short_table_end */
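/* The MOVE_SHORTCHUNK table above finishes copies that are too short or too
 * misaligned for word moves: the jmpl computes an entry point from the
 * residual count so that only the required two-byte pairs are executed; a
 * trailing odd byte, if any, is dealt with separately.
 */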
90:     /* short_aligned_end */
        ld [%o1 + 0x00], %g2
        ld [%o1 + 0x04], %g3
        st %g2, [%o0 + 0x00]
        st %g3, [%o0 + 0x04]