/*
    Copyright © 2009, The AROS Development Team. All rights reserved.
*/
#include <aros/debug.h>
#include <aros/libcall.h>
#include <proto/exec.h>
#define SSE_REG_SIZE 16
#define SSE_REG_MASK 0xF

#define MEMFENCE __asm__ __volatile__ ("sfence":::"memory")
#define MMENABLE __asm__ __volatile__ ("emms":::"memory")
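/*
    MEMFENCE (sfence) drains the write-combining buffers so that the weakly
    ordered movntps stores used below become globally visible. MMENABLE
    (emms) resets the x87 tag word after MMX register use.
*/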
#define __byte_memcpy(src,dst,size)                             \
{                                                               \
    __asm__ __volatile__(                                       \
        "    rep; movsb"                                        \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)               \
        : "0" (dst), "1" (src), "2" (size)                      \
        : "memory");                                            \
}
#define __long_memcpy(src,dst,size)                             \
{                                                               \
    __asm__ __volatile__(                                       \
        "    rep; movsl" "\n\t"                                 \
        "    testb $2,%b6" "\n\t"                               \
        "    je 1f" "\n\t"                                      \
        "    movsw" "\n"                                        \
        "1:  testb $1,%b6" "\n\t"                               \
        "    je 2f" "\n\t"                                      \
        "    movsb" "\n"                                        \
        "2:"                                                    \
        : "=&D" (dst), "=&S" (src), "=&c" (dummy)               \
        : "0" (dst), "1" (src), "2" (size >> 2), "q" (size)     \
        : "memory");                                            \
}
static __inline__ void __small_memcpy(const void * src, void * dst, ULONG size)
{
    register unsigned long int dummy;

    if (size < 4)
    {
        D(bug("[Exec] __byte_memcpy(%p, %p, %d)\n", src, dst, size));
        __byte_memcpy(src, dst, size);
    }
    else
    {
        D(bug("[Exec] __long_memcpy(%p, %p, %d)\n", src, dst, size));
        __long_memcpy(src, dst, size);
    }
}
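/*
    __small_memcpy handles the sub-64-byte cases: the alignment pre-copy
    and whatever tail is left after the SSE loops.
*/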
/*****************************************************************************

    NAME */

        AROS_LH3I(void, CopyMem,

/*  SYNOPSIS */
        AROS_LHA(CONST_APTR, source, A0),
        AROS_LHA(APTR,       dest,   A1),
        AROS_LHA(ULONG,      size,   D0),

/*  LOCATION */
        struct ExecBase *, SysBase, 104, Exec)
/*  FUNCTION
        Copy some data from one location in memory to another, using an
        SSE optimised copying method when there is enough data.

    INPUTS
        source - Pointer to the source area
        dest   - Pointer to the destination area
        size   - Number of bytes to copy

    RESULT
    NOTES
        The source and destination areas are not allowed to overlap.
        If the source is not on a 16-byte boundary, it is aligned first
        (as long as there is enough data). The main loop then copies
        through four 16-byte SSE registers, i.e. 64 bytes at a time.
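    EXAMPLE
        A minimal usage sketch (illustrative, not part of the original
        autodoc). The two areas must not overlap:

            UBYTE from[256];
            UBYTE to[256];

            CopyMem(from, to, sizeof(from));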
    BUGS

    SEE ALSO

    INTERNALS

******************************************************************************/
{
    AROS_LIBFUNC_INIT
    ULONG lcnt = (size >> 6);       /* size / 64 */

    const void *src = source;
    void *dst = dest;

    D(bug("[Exec] CopyMem(%p, %p, %d)\n", src, dst, size));
    __asm__ __volatile__ (
        "   prefetchnta (%0)\n"
        "   prefetchnta 32(%0)\n"
        "   prefetchnta 64(%0)\n"
        "   prefetchnta 96(%0)\n"
        "   prefetchnta 128(%0)\n"
        "   prefetchnta 160(%0)\n"
        "   prefetchnta 192(%0)\n"
        "   prefetchnta 256(%0)\n"
        "   prefetchnta 288(%0)\n"
        :
        : "r" (src));
    if ((lcnt > 0) && (size >= (SSE_REG_SIZE * 4)))
    {
        D(bug("[Exec] CopyMem: Using SSE Copy.\n"));

        ULONG alignsize = (SSE_REG_SIZE - ((IPTR)src & SSE_REG_MASK));
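        /*
            alignsize is the number of bytes needed to advance src to the
            next 16-byte boundary. The pre-copy below only pays off when
            dst reaches a 16-byte boundary at the same time.
        */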
        if ((((IPTR)src & SSE_REG_MASK) != 0) && (((IPTR)(dst + alignsize) & SSE_REG_MASK) == 0))
        {
            D(bug("[Exec] CopyMem: Aligning src to %d byte boundary (%d bytes) ..\n", SSE_REG_SIZE, alignsize));

            __small_memcpy(src, dst, alignsize);

            src  += alignsize;
            dst  += alignsize;
            size -= alignsize;

            lcnt = (size >> 6);     /* size / 64 */
        }
        if ((((IPTR)src & SSE_REG_MASK) == 0) && (((IPTR)dst & SSE_REG_MASK) == 0))
        {
            /*
                SRC and DST are aligned on a 16-byte boundary, so movaps
                can be used instead of movups; it meets the alignment
                constraints (a general-protection fault would be triggered
                otherwise).
            */
            size -= (lcnt << 6);    /* bytes left over after the 64-byte copies */
            for (; lcnt > 0; lcnt--)
            {
                D(bug("[Exec] CopyMem: SSE Aligned-Copy %p to %p.\n", src, dst));
                __asm__ __volatile__ (
                    "   prefetchnta 320(%0)\n"
                    "   prefetchnta 352(%0)\n"
                    "   movaps (%0), %%xmm0\n"
                    "   movaps 16(%0), %%xmm1\n"
                    "   movaps 32(%0), %%xmm2\n"
                    "   movaps 48(%0), %%xmm3\n"
                    "   movntps %%xmm0, (%1)\n"
                    "   movntps %%xmm1, 16(%1)\n"
                    "   movntps %%xmm2, 32(%1)\n"
                    "   movntps %%xmm3, 48(%1)\n"
                    :
                    : "r" (src), "r" (dst)
                    : "memory");
                src += (SSE_REG_SIZE * 4);
                dst += (SSE_REG_SIZE * 4);
            }
        }
        else if (((IPTR)dst & SSE_REG_MASK) == 0)
        {
            /*
                SRC is unaligned and DST is aligned on a 16-byte boundary,
                so the loads must use movups.
            */
            size -= (lcnt << 6);    /* bytes left over after the 64-byte copies */
            for (; lcnt > 0; lcnt--)
            {
                D(bug("[Exec] CopyMem: SSE Unaligned-Copy %p to %p.\n", src, dst));
                __asm__ __volatile__ (
                    "   prefetchnta 320(%0)\n"
                    "   prefetchnta 352(%0)\n"
                    "   movups (%0), %%xmm0\n"
                    "   movups 16(%0), %%xmm1\n"
                    "   movups 32(%0), %%xmm2\n"
                    "   movups 48(%0), %%xmm3\n"
                    "   movntps %%xmm0, (%1)\n"
                    "   movntps %%xmm1, 16(%1)\n"
                    "   movntps %%xmm2, 32(%1)\n"
                    "   movntps %%xmm3, 48(%1)\n"
                    :
                    : "r" (src), "r" (dst)
                    : "memory");
                src += (SSE_REG_SIZE * 4);
                dst += (SSE_REG_SIZE * 4);
            }
        }
    }

    if (size > 0)
    {
        D(bug("[Exec] CopyMem: Copy remaining %d bytes.\n", size));
        __small_memcpy(src, dst, size);
    }
    /*
        Fence memory to re-order again, since the movntps stores above are
        weakly ordered.
    */
    MEMFENCE;
    D(bug("[Exec] CopyMem: Finished.\n"));

    AROS_LIBFUNC_EXIT
} /* CopyMem */