/*
    Copyright © 2009-2012, The AROS Development Team. All rights reserved.
    $Id$

    Desc: Copy memory.
    Lang: english
*/
#define DEBUG 0
#include <aros/debug.h>

#include <aros/libcall.h>
#include <proto/exec.h>
#define SSE_REG_SIZE 16
#define SSE_REG_MASK 0xF

/* sfence makes the weakly ordered non-temporal stores globally visible;
   emms resets the MMX/x87 state afterwards. */
#define MEMFENCE __asm__ __volatile__ ("sfence":::"memory")
#define MMENABLE __asm__ __volatile__ ("emms":::"memory")
/* Copy 'size' bytes with "rep movsb"; the caller must declare a local
   'dummy' variable, which receives the clobbered count register. */
#define __byte_memcpy(src,dst,size)                         \
{                                                           \
    __asm__ __volatile__(                                   \
        "    rep; movsb"                                    \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)           \
        : "0" (dst), "1" (src), "2" (size)                  \
        : "memory");                                        \
}
/* Copy 'size' bytes: "rep movsl" moves size/4 longwords, then the low two
   bits of 'size' (%b6) select an extra movsw and/or movsb for the tail. */
#define __long_memcpy(src,dst,size)                         \
{                                                           \
    __asm__ __volatile__(                                   \
        "    rep; movsl" "\n\t"                             \
        "    testb $2,%b6" "\n\t"                           \
        "    je 1f" "\n\t"                                  \
        "    movsw" "\n"                                    \
        "1:  testb $1,%b6" "\n\t"                           \
        "    je 2f" "\n\t"                                  \
        "    movsb" "\n"                                    \
        "2:"                                                \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)           \
        : "0" (dst), "1" (src), "2" (size >> 2), "q" (size) \
        : "memory");                                        \
}
static __inline__ void __small_memcpy(const void * src, void * dst, ULONG size)
{
    register unsigned long int dummy;

    if( size < 4 ) {
        D(bug("[Exec] __byte_memcpy(%p, %p, %ld)\n", src, dst, size));

        __byte_memcpy(src, dst, size);
    }
    else
    {
        D(bug("[Exec] __long_memcpy(%p, %p, %ld)\n", src, dst, size));

        __long_memcpy(src, dst, size);
    }
}
/*****************************************************************************

    NAME */

        AROS_LH3I(void, CopyMem,

/*  SYNOPSIS */
        AROS_LHA(CONST_APTR, source, A0),
        AROS_LHA(APTR,       dest,   A1),
        AROS_LHA(IPTR,       size,   D0),

/*  LOCATION */
        struct ExecBase *, SysBase, 104, Exec)
/*  FUNCTION
        Copy a block of data from one location in memory to another, using
        an SSE-optimised copy routine when there is enough data to make it
        worthwhile.

    INPUTS
        source - Pointer to the source area
        dest   - Pointer to the destination area
        size   - Number of bytes to copy

    RESULT

    NOTES
        The source and destination areas are not allowed to overlap.
        If the source is not on a 16-byte boundary it is aligned first
        (provided there is enough data); the main loop then copies
        4 x 16-byte SSE registers = 64 bytes per iteration.
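        As an illustration of the alignment step (hypothetical address):
        for a source at 0x100009, 0x100009 & 0xF = 9, so the first
        16 - 9 = 7 bytes are copied with __small_memcpy() and, provided the
        destination reaches a 16-byte boundary at the same time, the
        64-byte SSE loop then runs on aligned data.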

    EXAMPLE
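        A minimal sketch with hypothetical buffer names; the two areas
        must not overlap:

            UBYTE from[256];
            UBYTE to[256];

            CopyMem(from, to, sizeof(from));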

    BUGS

    SEE ALSO
        CopyMemQuick()

    INTERNALS
        64-bit sizes are not handled yet.

******************************************************************************/
{
    AROS_LIBFUNC_INIT

    if (!size) return;

    ULONG lcnt = (size >> 6);   /* size / 64 */

    const void *src = source;
    void *dst = dest;

    D(bug("[Exec] CopyMem(%p, %p, %ld)\n", src, dst, size));
    /* Prefetch the start of the source area. */
    __asm__ __volatile__ (
        "    prefetchnta (%0)\n"
        "    prefetchnta 32(%0)\n"
        "    prefetchnta 64(%0)\n"
        "    prefetchnta 96(%0)\n"
        "    prefetchnta 128(%0)\n"
        "    prefetchnta 160(%0)\n"
        "    prefetchnta 192(%0)\n"
        "    prefetchnta 256(%0)\n"
        "    prefetchnta 288(%0)\n"
        :
        : "r" (src) );
    if ((lcnt > 0) && (size >= (SSE_REG_SIZE * 4)))
    {
        D(bug("[Exec] CopyMem: Using SSE Copy.\n"));
        ULONG alignsize = ((SSE_REG_SIZE - ((IPTR)src & SSE_REG_MASK)));

        if ((((IPTR)src & SSE_REG_MASK) != 0) && (((IPTR)(dst + alignsize) & SSE_REG_MASK) == 0))
        {
            D(bug("[Exec] CopyMem: Aligning src to %d byte boundary (%d bytes) ..\n", SSE_REG_SIZE, alignsize));

            __small_memcpy(src, dst, alignsize);

            size -= alignsize;
            lcnt = (size >> 6);   /* size / 64 */
            src += alignsize;
            dst += alignsize;
        }
        if (lcnt > 0) {
            if ((((IPTR)src & SSE_REG_MASK) == 0) && (((IPTR)dst & SSE_REG_MASK) == 0))
            {
                /*
                    SRC and DST are both aligned on a 16-byte boundary.
                    We can use movaps instead of movups since we meet the
                    alignment constraints (a general-protection fault would
                    be triggered otherwise).
                */
                size -= (lcnt << 6);
                for( ; lcnt > 0; lcnt--)
                {
                    D(bug("[Exec] CopyMem: SSE Aligned-Copy %p to %p.\n", src, dst));

                    __asm__ __volatile__ (
                        "    prefetchnta 320(%0)\n"
                        "    prefetchnta 352(%0)\n"
                        "    movaps (%0), %%xmm0\n"
                        "    movaps 16(%0), %%xmm1\n"
                        "    movaps 32(%0), %%xmm2\n"
                        "    movaps 48(%0), %%xmm3\n"
                        "    movntps %%xmm0, (%1)\n"
                        "    movntps %%xmm1, 16(%1)\n"
                        "    movntps %%xmm2, 32(%1)\n"
                        "    movntps %%xmm3, 48(%1)\n"
                        :
                        : "r" (src), "r" (dst)
                        : "memory");

                    src += (SSE_REG_SIZE * 4);
                    dst += (SSE_REG_SIZE * 4);
                }
            }
            else if (((IPTR)dst & SSE_REG_MASK) == 0)
            {
                /*
                    SRC is unaligned but DST is aligned on a 16-byte
                    boundary, so use movups for the loads.
                */
                size -= (lcnt << 6);
                for( ; lcnt > 0; lcnt--)
                {
                    D(bug("[Exec] CopyMem: SSE Unaligned-Copy %p to %p.\n", src, dst));

                    __asm__ __volatile__ (
                        "    prefetchnta 320(%0)\n"
                        "    prefetchnta 352(%0)\n"
                        "    movups (%0), %%xmm0\n"
                        "    movups 16(%0), %%xmm1\n"
                        "    movups 32(%0), %%xmm2\n"
                        "    movups 48(%0), %%xmm3\n"
                        "    movntps %%xmm0, (%1)\n"
                        "    movntps %%xmm1, 16(%1)\n"
                        "    movntps %%xmm2, 32(%1)\n"
                        "    movntps %%xmm3, 48(%1)\n"
                        :
                        : "r" (src), "r" (dst)
                        : "memory");

                    src += (SSE_REG_SIZE * 4);
                    dst += (SSE_REG_SIZE * 4);
                }
            }
        }
    }
    if (size > 0)
    {
        D(bug("[Exec] CopyMem: Copy remaining %ld bytes.\n", size));
        __small_memcpy(src, dst, size);
    }
    /*
        The movntps stores above are weakly ordered; fence so that they
        are globally visible before we return.
    */
    MEMFENCE;

    /*
        Reset the MMX/x87 state so normal FPU use can resume.
    */
    MMENABLE;

    D(bug("[Exec] CopyMem: Finished.\n"));
    AROS_LIBFUNC_EXIT
} /* CopyMem */