/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#undef FASTER_NONALIGNED
#define FASTER_ALIGNED

/* In the kernel these functions do not return a value;
 * use the macros in asm/string.h for that purpose.
 * We return 0 here, so that bugs are more apparent.
 */
#define SETUP_RETL
#define RETL_INSN       clr %o0
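
/* Illustrative sketch only -- not the actual asm/string.h contents.  A
 * wrapper macro of roughly this shape gives C callers the usual
 * "returns dst" semantics even though this code clears %o0:
 *
 *      #define memcpy(dst, src, n) \
 *              ({ void *__d = (dst); __kernel_memcpy(__d, (src), (n)); __d; })
 *
 * where __kernel_memcpy stands in, hypothetically, for the routine below.
 */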

#define FASTER_REVERSE
#define FASTER_NONALIGNED
#define FASTER_ALIGNED

#define SETUP_RETL      mov %o0, %g6
#define RETL_INSN       mov %g6, %o0
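
/* Here SETUP_RETL saves the original destination pointer in %g6 on entry and
 * RETL_INSN copies it back into %o0 on return, so memcpy/memmove return dst
 * as the C library expects.
 */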

/* Both these macros have to start with exactly the same insn */
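/* MOVE_BIGCHUNK copies a 32-byte block forward: four ldd loads fill the
 * register pairs t0-t7, which are then written back as eight single-word
 * stores, so the destination only needs to be word aligned.
 */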
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        ldd [%src + (offset) + 0x10], %t4; \
        ldd [%src + (offset) + 0x18], %t6; \
        st %t0, [%dst + (offset) + 0x00]; \
        st %t1, [%dst + (offset) + 0x04]; \
        st %t2, [%dst + (offset) + 0x08]; \
        st %t3, [%dst + (offset) + 0x0c]; \
        st %t4, [%dst + (offset) + 0x10]; \
        st %t5, [%dst + (offset) + 0x14]; \
        st %t6, [%dst + (offset) + 0x18]; \
        st %t7, [%dst + (offset) + 0x1c];
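
/* MOVE_BIGALIGNCHUNK is the same 32-byte forward copy, but it stores with
 * std, so the destination must be doubleword aligned as well.
 */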
#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        ldd [%src + (offset) + 0x10], %t4; \
        ldd [%src + (offset) + 0x18], %t6; \
        std %t0, [%dst + (offset) + 0x00]; \
        std %t2, [%dst + (offset) + 0x08]; \
        std %t4, [%dst + (offset) + 0x10]; \
        std %t6, [%dst + (offset) + 0x18];
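
/* MOVE_LASTCHUNK copies one 16-byte group at a negative offset; the jump
 * table at 79: below enters this unrolled sequence part-way through to mop
 * up the 16-byte groups left over after the big 128-byte loop.
 */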
#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - (offset) - 0x10], %t0; \
        ldd [%src - (offset) - 0x08], %t2; \
        st %t0, [%dst - (offset) - 0x10]; \
        st %t1, [%dst - (offset) - 0x0c]; \
        st %t2, [%dst - (offset) - 0x08]; \
        st %t3, [%dst - (offset) - 0x04];
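
/* MOVE_LASTALIGNCHUNK is the doubleword-aligned version of MOVE_LASTCHUNK,
 * storing with two std instructions instead of four st.
 */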
#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src - (offset) - 0x10], %t0; \
        ldd [%src - (offset) - 0x08], %t2; \
        std %t0, [%dst - (offset) - 0x10]; \
        std %t2, [%dst - (offset) - 0x08];
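
/* MOVE_SHORTCHUNK moves two bytes with ldub/stb; the short-copy jump table
 * below steps through a run of these for the remaining odd bytes.
 */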
#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src - (offset) - 0x02], %t0; \
        ldub [%src - (offset) - 0x01], %t1; \
        stb %t0, [%dst - (offset) - 0x02]; \
        stb %t1, [%dst - (offset) - 0x01];
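
/* The RMOVE_* macros below are the reverse counterparts of the macros above,
 * used when the copy has to run from high addresses to low (the overlapping
 * memmove case): RMOVE_BIGCHUNK and RMOVE_BIGALIGNCHUNK move 32 bytes below
 * src/dst, RMOVE_LASTCHUNK moves 16 bytes and RMOVE_SHORTCHUNK 2 bytes.
 */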
/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - (offset) - 0x20], %t0; \
        ldd [%src - (offset) - 0x18], %t2; \
        ldd [%src - (offset) - 0x10], %t4; \
        ldd [%src - (offset) - 0x08], %t6; \
        st %t0, [%dst - (offset) - 0x20]; \
        st %t1, [%dst - (offset) - 0x1c]; \
        st %t2, [%dst - (offset) - 0x18]; \
        st %t3, [%dst - (offset) - 0x14]; \
        st %t4, [%dst - (offset) - 0x10]; \
        st %t5, [%dst - (offset) - 0x0c]; \
        st %t6, [%dst - (offset) - 0x08]; \
        st %t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
        ldd [%src - (offset) - 0x20], %t0; \
        ldd [%src - (offset) - 0x18], %t2; \
        ldd [%src - (offset) - 0x10], %t4; \
        ldd [%src - (offset) - 0x08], %t6; \
        std %t0, [%dst - (offset) - 0x20]; \
        std %t2, [%dst - (offset) - 0x18]; \
        std %t4, [%dst - (offset) - 0x10]; \
        std %t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        st %t0, [%dst + (offset) + 0x00]; \
        st %t1, [%dst + (offset) + 0x04]; \
        st %t2, [%dst + (offset) + 0x08]; \
        st %t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
        ldub [%src + (offset) + 0x00], %t0; \
        ldub [%src + (offset) + 0x01], %t1; \
        stb %t0, [%dst + (offset) + 0x00]; \
        stb %t1, [%dst + (offset) + 0x01];
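
/* SMOVE_CHUNK and SMOVE_ALIGNCHUNK handle the case where source and
 * destination are not mutually word aligned: aligned doublewords are loaded
 * from src, neighbouring words are merged with srl/sll/or (shil and shir
 * always add up to 32), and the merged doublewords are stored to dst.
 * 'prev' carries the left-over bits from one invocation to the next, and
 * 'offset2' adjusts the store address for the byte skew between the two
 * pointers.
 */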
#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        srl %t0, shir, %t5; \
        srl %t1, shir, %t6; \
        sll %t0, shil, %t0; \
        or %t5, %prev, %t5; \
        sll %t1, shil, %prev; \
        or %t6, %t0, %t0; \
        srl %t2, shir, %t1; \
        srl %t3, shir, %t6; \
        sll %t2, shil, %t2; \
        or %t1, %prev, %t1; \
        std %t4, [%dst + (offset) + (offset2) - 0x04]; \
        std %t0, [%dst + (offset) + (offset2) + 0x04]; \
        sll %t3, shil, %prev; \
        or %t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
        ldd [%src + (offset) + 0x00], %t0; \
        ldd [%src + (offset) + 0x08], %t2; \
        srl %t0, shir, %t4; \
        srl %t1, shir, %t5; \
        sll %t0, shil, %t6; \
        or %t4, %prev, %t0; \
        sll %t1, shil, %prev; \
        or %t5, %t6, %t1; \
        srl %t2, shir, %t4; \
        srl %t3, shir, %t5; \
        sll %t2, shil, %t6; \
        or %t4, %prev, %t2; \
        sll %t3, shil, %prev; \
        or %t5, %t6, %t3; \
        std %t0, [%dst + (offset) + (offset2) + 0x00]; \
        std %t2, [%dst + (offset) + (offset2) + 0x08];

#ifdef FASTER_REVERSE

70:     /* rdword_align */

#endif /* FASTER_REVERSE */

        nop                             ! Only bcopy returns here and it returns void...

#ifndef FASTER_REVERSE

1:      /* reverse_bytes */

#else /* FASTER_REVERSE */

        andcc %g1, 0xffffff80, %g7

        RMOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

        jmpl %o5 + %lo(72f), %g0

71:     /* rmemcpy_table */
        RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

72:     /* rmemcpy_table_end */

        ldd [%o1 - 0x08], %g2

73:     /* rmemcpy_last7 */

        RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        RMOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

        jmpl %o5 + %lo(72b), %g0

        jmpl %o5 + %lo(76f), %g0

        RMOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        RMOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

76:     /* rshort_table_end */

91:     /* rshort_aligned_end */

77:     /* rnon_aligned */

#endif /* FASTER_REVERSE */

        /* NOTE: This code is executed just for the cases
         * where %src (=%o1) & 3 is != 0.
         * We need to align it to 4. So, for (%src & 3):
         * 1 we need to do ldub,lduh
         * 2 lduh
         * 3 just ldub
         * so even if it looks weird, the branches
         * are correct here. -jj
         */

78:     /* dword_align */

FUNC(memcpy)    /* %o0=dst %o1=src %o2=len */

        andcc %g1, 0xffffff80, %g7
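        /* Round the remaining count (here in %g1) down to a multiple of 128
         * bytes; each pass of the unrolled loop below moves 4 x 32 = 128
         * bytes with MOVE_BIGCHUNK.
         */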

        MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

        jmpl %o5 + %lo(80f), %g0
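        /* Computed jump into the unrolled table at 79:.  The offset in %o5
         * (set up in code elided here) is derived from how many 16-byte
         * groups remain, so execution enters the table at the right
         * MOVE_LASTCHUNK and falls through to 80:.
         */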

79:     /* memcpy_table */

        MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:     /* memcpy_table_end */

81:     /* memcpy_last7 */

        MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
        MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)

#ifndef FASTER_ALIGNED

        jmpl %o5 + %lo(80b), %g0

#else /* FASTER_ALIGNED */

        jmpl %o5 + %lo(84f), %g0

83:     /* amemcpy_table */

        MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
        MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:     /* amemcpy_table_end */

        std %g2, [%o0 - 0x08]

85:     /* amemcpy_last7 */

#endif /* FASTER_ALIGNED */

86:     /* non_aligned */

#ifdef FASTER_NONALIGNED

#endif /* FASTER_NONALIGNED */

#ifdef FASTER_NONALIGNED

87:     /* faster_nonaligned */
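
        /* The SMOVE_CHUNK blocks below come in three flavours, one per
         * possible byte skew between source and destination, using the
         * shift pairs 8/24, 16/16 and 24/8 (shil + shir is always 32) with
         * offset2 values of -3, -2 and -1.  The SMOVE_ALIGNCHUNK groups
         * further down perform the same merge but store at naturally
         * aligned doubleword offsets.
         */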

        and %o2, 0xffffffc0, %o3
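        /* %o3 is the remaining count rounded down to a multiple of 64; each
         * group of four SMOVE_CHUNKs below moves 4 x 16 = 64 bytes.
         */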

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

        and %o2, 0xffffffc0, %o3

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

        and %o2, 0xffffffc0, %o3

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)
        SMOVE_CHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)

        andcc %o2, 0x30, %o3

        SMOVE_CHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, -1)

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

        andcc %o2, 0x30, %o3

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 8, 24, -3)

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)

        andcc %o2, 0x30, %o3

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 24, 8, 3)

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)
        SMOVE_ALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

        andcc %o2, 0x30, %o3

        SMOVE_ALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5, o4, o5, g7, g1, 16, 16, -2)

#endif /* FASTER_NONALIGNED */

        jmpl %o5 + %lo(89f), %g0

        MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
        MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:     /* short_table_end */

90:     /* short_aligned_end */

        ld [%o1 + 0x00], %g2
        ld [%o1 + 0x04], %g3

        st %g2, [%o0 + 0x00]
        st %g3, [%o0 + 0x04]