#include <linux/types.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/module.h>

#include <asm/i387.h>
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 */
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6;	/* len/64 */
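	/*
	 * Only whole 64-byte blocks are fed through the MMX loops below;
	 * the remaining len & 63 bytes are handled by the plain __memcpy()
	 * at the end of the function.
	 */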

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for(; i>5; i--)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	for(; i>0; i--)
	{
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 *	Now do the tail of the block
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();
	return p;
}
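
/*
 * Note: with CONFIG_X86_USE_3DNOW set, the arch memcpy() in asm/string.h
 * (on kernels of this era) hands large copies - roughly 512 bytes and up -
 * to _mmx_memcpy(); for short copies the FPU/MMX save and restore overhead
 * would outweigh any gain, so they stay on the ordinary string-op path.
 */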

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX using processors do not.
 */
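
/*
 * movntq is a non-temporal (cache-bypassing) store: clearing or copying a
 * whole page with it avoids displacing the current working set from the
 * caches with data that will not be touched again soon.  The price is that
 * the stores are weakly ordered, hence the sfence once each page is done.
 */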

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/64;i++)
	{
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/* maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for(i=0; i<(4096-320)/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	for(i=(4096-320)/64; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	/* since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again.
	 */
	__asm__ __volatile__ (
		"  sfence \n" : :
	);
	kernel_fpu_end();
}

#else

/*
 *	Generic MMX implementation without K7 specific streaming
 */
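
/*
 * Plain movq stores are used here: only the K7 path can rely on movntq,
 * and ordinary MMX stores go through the cache and are strongly ordered,
 * so no trailing sfence is needed in this variant.
 */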

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

	for(i=0;i<4096/128;i++)
	{
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
		: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from));

	for(i=0; i<4096/64; i++)
	{
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b, 3b\n"
		".previous"
		: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}

	kernel_fpu_end();
}

#endif

/*
 *	Favour MMX for page clear and copy.
 */
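
/*
 * The slow_* variants below fall back to "rep ; stosl" / "rep ; movsl"
 * over 1024 dwords (one 4096-byte page).  They exist because MMX/FPU state
 * cannot safely be saved and clobbered from interrupt context, so
 * in_interrupt() callers must take the non-MMX path, mirroring the
 * __memcpy() fallback in _mmx_memcpy().
 */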

static void slow_zero_page(void * page)
{
	int d0, d1;
	__asm__ __volatile__( \
		"cld\n\t" \
		"rep ; stosl" \
		: "=&c" (d0), "=&D" (d1)
		:"a" (0),"1" (page),"0" (1024)
		:"memory");
}

void mmx_clear_page(void * page)
{
	if(unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;
	__asm__ __volatile__( \
		"cld\n\t" \
		"rep ; movsl" \
		: "=&c" (d0), "=&D" (d1), "=&S" (d2) \
		: "0" (1024),"1" ((long) to),"2" ((long) from) \
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if(unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}

EXPORT_SYMBOL(_mmx_memcpy);
EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);