/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/pc32/i386/bcopy.s,v 1.10 2008/05/09 06:35:11 dillon Exp $
 */
/*
 * bcopy(source:%esi, target:%edi, count:%ecx)
 *
 * note: esi, edi, eax, ecx, and edx may be destroyed
 */
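/*
 * Illustrative only: a minimal sketch of how a C-callable (cdecl)
 * wrapper could load the register convention above from the stack
 * frame.  The label `example_bcopy_wrapper' is hypothetical and not
 * part of this file; the block is disabled under #if 0.
 */
#if 0
ENTRY(example_bcopy_wrapper)
        pushl   %esi
        pushl   %edi
        movl    12(%esp),%esi           /* source (arg 1) */
        movl    16(%esp),%edi           /* target (arg 2) */
        movl    20(%esp),%ecx           /* count  (arg 3) */
        call    asm_generic_bcopy       /* may destroy eax, ecx, edx */
        popl    %edi
        popl    %esi
        ret
#endif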
#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>
/*
 * bcopyb() is a 'dumb' byte-granular bcopy.  It is only used by
 * devices which need to bcopy device-mapped memory which cannot
 * otherwise handle 16 or 32 bit ops.
 */
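/*
 * A minimal sketch of the byte-granular forward path described above,
 * assuming %esi, %edi and %ecx are already loaded; the real routine
 * must also handle the overlapping (backwards) case.
 */
#if 0
        cld                             /* forward direction */
        rep
        movsb                           /* copy %ecx bytes, one at a time */
#endif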
        cmpl    %ecx,%eax               /* overlapping && src < dst? */
        cld                             /* nope, copy forwards */

        addl    %ecx,%edi               /* copy backwards. */
/*
 * bcopyi(s, d, len) (NON OVERLAPPING)
 *
 * This is a dumb 32-bit-granular bcopy
 */
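/*
 * A minimal sketch of a 32-bit-granular copy, assuming a byte count
 * in %ecx; the tail bytes (count % 4) get a movsb pass.  Sketch only,
 * not the routine's actual body.
 */
#if 0
        cld
        movl    %ecx,%edx               /* remember byte count */
        shrl    $2,%ecx                 /* convert to dword count */
        rep
        movsl                           /* copy %ecx dwords */
        movl    %edx,%ecx
        andl    $3,%ecx                 /* leftover bytes */
        rep
        movsb
#endif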
/*
 * If memcpy/bcopy is called as part of a copyin or copyout, the
 * on-fault routine is set up to do a 'ret'.  We have to restore
 * %ebx and return to the copyin/copyout fault handler.
 */
        addl    $4,%esp                 /* skip normal return vector */
        ret                             /* return to copyin/copyout fault handler */
/*
 * GENERIC BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * Reasonably optimal on all modern machines.
 */
ENTRY(asm_generic_memcpy)               /* memcpy() entry point, use optimal copy */
        pushl   $generic_onfault

ENTRY(asm_generic_bcopy)
        pushl   $generic_onfault
        cmpl    %esi,%edi               /* if (edi < esi) fwd copy ok */
        cmpl    %esi,%edi               /* if (edi < esi + count) do bkwrds copy */
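/*
 * A sketch of the overlap test the two compares above implement: a
 * forward copy is unsafe only when the target starts inside the
 * source range, i.e. esi <= edi < esi + count.  The 8:/9: labels are
 * illustrative stand-ins for the backward/forward paths.
 */
#if 0
        cmpl    %esi,%edi
        jb      9f                      /* edi < esi: forward copy ok */
        movl    %esi,%eax
        addl    %ecx,%eax               /* eax = esi + count */
        cmpl    %eax,%edi
        jb      8f                      /* edi < esi + count: copy backwards */
        jmp     9f                      /* disjoint: forward copy ok */
8:      /* backwards copy path */
9:      /* forward copy path */
#endif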
/*
 * GENERIC_BCOPY() - BACKWARDS COPY
 */
/*
 * MMX BCOPY() - COPY DIRECTION CHECK AND FORWARDS COPY
 *
 * note: esi, edi, eax, ecx, and edx are allowed to be destroyed.
 *
 * In order for the kernel to be able to use the FPU:
 *
 *      (1) The kernel may not already be using the fpu.
 *
 *      (2) If the fpu is owned by the application, we must save
 *          its state.  If the fpu is not owned by the application
 *          the application's saved fp state may already exist.
 *
 *      (3) We cannot allow the kernel to overwrite the application's
 *          FPU state with our own, so we make sure the application's
 *          FPU state has been saved and then point TD_SAVEFPU at a
 *          temporary fpu save area in the globaldata structure.
 *
 * If gd_npxthread is not NULL we must save the application's
 * current FP state to the current save area and then NULL
 * out gd_npxthread to interlock against new interruptions
 * changing the FP state further.
 *
 * If gd_npxthread is NULL the FP unit is in a known 'safe'
 * state and may be used once the new save area is installed.
 *
 * race(1): If an interrupt occurs just prior to calling fxsave
 * all that happens is that fxsave gets a npxdna trap, restores
 * the app's environment, and immediately traps, restores,
 * and saves it again.
 *
 * race(2): No interrupt can safely occur after we NULL-out
 * npxthread until we fnclex, because the kernel assumes that
 * the FP unit is in a safe state when npxthread is NULL.  It's
 * more convenient to use a cli sequence here (it is not
 * considered to be in the critical path), but a critical
 * section would also work.
 *
 * NOTE ON FNINIT vs FNCLEX - Making the FP unit safe here is
 * the goal.  It should be sufficient to just call FNCLEX rather
 * than having to FNINIT the entire unit.
 *
 * race(3): The FP unit is in a known state (because npxthread
 * was either previously NULL or we saved and init'd and made
 * it NULL).  This is true even if we are preempted and the
 * preempting thread uses the FP unit, because it will be
 * fninit'd again on return.  ANY STATE WE SAVE TO THE FPU MAY
 * BE DESTROYED BY PREEMPTION WHILE NPXTHREAD IS NULL!  However,
 * an interrupt occurring between clts and the setting of
 * gd_npxthread may set the TS bit again and cause the next
 * npxdna() to panic when it sees a non-NULL gd_npxthread.
 *
 * We can safely set TD_SAVEFPU to point to a new uninitialized
 * save area and then set GD_NPXTHREAD to non-NULL.  If an
 * interrupt occurs after we set GD_NPXTHREAD, all that happens
 * is that the safe FP state gets saved and restored.  We do not
 * need to clex again.
 *
 * We can safely clts after setting up the new save-area, before
 * installing gd_npxthread, even if we get preempted just after
 * calling clts.  This is because the FP unit will be in a safe
 * state while gd_npxthread is NULL.  Setting gd_npxthread will
 * simply lock-in that safe-state.  Calling clts saves
 * unnecessary trap overhead since we are about to use the FP
 * unit anyway and don't need to 'restore' any state prior to
 * that use.
 *
 * MMX+XMM (SSE2): Typical on Athlons, later P4s.  128 bit media insn.
 * MMX: Typical on XPs and P3s.  64 bit media insn.
 */
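/*
 * A sketch of the race(2) interlock described above, assuming %eax
 * already holds MYCPU: with interrupts disabled, no npxdna trap can
 * sneak in between clearing gd_npxthread and making the FP unit safe
 * with fnclex.  Illustrative only; a critical section would also work.
 */
#if 0
        cli
        movl    $0,GD_NPXTHREAD(%eax)   /* interlock interrupts */
        fnclex                          /* make the FP unit 'safe' */
        sti
#endif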
#define MMX_SAVE_BLOCK(missfunc)                                        \
        movl    MYCPU,%eax ;                    /* EAX = MYCPU */       \
        btsl    $1,GD_FPU_LOCK(%eax) ;                                  \
        movl    GD_CURTHREAD(%eax),%edx ;       /* EDX = CURTHREAD */   \
        movl    TD_SAVEFPU(%edx),%ebx ;         /* save app save area */\
        addl    $TDPRI_CRIT,TD_PRI(%edx) ;                              \
        cmpl    $0,GD_NPXTHREAD(%eax) ;                                 \
        fxsave  0(%ebx) ;                       /* race(1) */           \
        movl    $0,GD_NPXTHREAD(%eax) ;         /* interlock intr */    \
        fnclex ;                                /* race(2) */           \
        leal    GD_SAVEFPU(%eax),%ecx ;                                 \
        movl    %ecx,TD_SAVEFPU(%edx) ;                                 \
        orl     $TDF_KERNELFP,TD_FLAGS(%edx) ;                          \
        movl    %edx,GD_NPXTHREAD(%eax) ;       /* race(3) */           \
        subl    $TDPRI_CRIT,TD_PRI(%edx) ;      /* crit_exit() */       \
        cmpl    $0,GD_REQFLAGS(%eax) ;                                  \
        cmpl    $TDPRI_CRIT,TD_PRI(%edx) ;                              \
                                /* note: eax,ecx,edx destroyed */       \
        movl    $mmx_onfault,(%esp)
/*
 * When restoring the application's FP state we must first clear
 * npxthread to prevent further saves, then restore the pointer
 * to the app's save area.  We do not have to (and should not)
 * restore the app's FP state now.  Note that we do not have to
 * call fnclex because our use of the FP guarantees that it is in
 * a 'safe' state (at least for kernel use).
 *
 * NOTE: it is not usually safe to mess with CR0 outside of a
 * critical section, because TS may get set by a preemptive
 * interrupt.  However, we *can* race a load/set-ts/store against
 * an interrupt doing the same thing.
 *
 * WARNING! A virtual kernel depends on CR0_TS remaining set after
 * we use the FP unit if it asked for it to be set.
 */
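/*
 * A sketch of the load/set-ts/store sequence the NOTE above refers
 * to; CR0_TS comes from <machine/specialreg.h>.  This is the
 * read-modify-write that can safely race an interrupt doing the same.
 */
#if 0
        movl    %cr0,%eax               /* load */
        orl     $CR0_TS,%eax            /* set TS */
        movl    %eax,%cr0               /* store */
#endif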
#define MMX_RESTORE_BLOCK

#define MMX_RESTORE_BLOCK2                                              \
        movl    GD_CURTHREAD(%ecx),%edx ;                               \
        movl    $0,GD_NPXTHREAD(%ecx) ;                                 \
        andl    $~TDF_KERNELFP,TD_FLAGS(%edx) ;                         \
        movl    %ebx,TD_SAVEFPU(%edx) ;                                 \
        movl    $0,GD_FPU_LOCK(%ecx)
/*
 * xmm/mmx_onfault routine.  Restore the fpu state, skip the normal
 * return vector, and return to the caller's on-fault routine
 * (which was pushed on the caller's stack just before he called us).
 */
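/*
 * A hedged sketch of the unwind described above, assuming %ebx still
 * holds the app save area saved by MMX_SAVE_BLOCK: restore the fpu
 * bookkeeping, pop the normal return vector, and 'ret' into the
 * caller's on-fault routine.  Sketch only, not the routine's body.
 */
#if 0
        movl    MYCPU,%ecx
        MMX_RESTORE_BLOCK2              /* clear npxthread, restore TD_SAVEFPU */
        addl    $4,%esp                 /* skip normal return vector */
        ret                             /* to caller's on-fault routine */
#endif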
/*
 * MMX entry points - only support 64 bit media instructions
 */
ENTRY(asm_mmx_memcpy)                   /* memcpy() entry point, use optimal copy */
        MMX_SAVE_BLOCK(asm_generic_memcpy)

ENTRY(asm_mmx_bcopy)
        MMX_SAVE_BLOCK(asm_generic_bcopy)
        cmpl    %esi,%edi               /* if (edi < esi) fwd copy ok */
        cmpl    %esi,%edi               /* if (edi < esi + count) do bkwrds copy */
/*
 * XMM entry points - support 128 bit media instructions
 */
ENTRY(asm_xmm_memcpy)                   /* memcpy() entry point, use optimal copy */
        MMX_SAVE_BLOCK(asm_generic_memcpy)

ENTRY(asm_xmm_bcopy)
        MMX_SAVE_BLOCK(asm_generic_bcopy)
        cmpl    %esi,%edi               /* if (edi < esi) fwd copy ok */
        cmpl    %esi,%edi               /* if (edi < esi + count) do bkwrds copy */
        movl    %esi,%eax               /* skip xmm if the data is not aligned */
        movdqa  16(%esi),%xmm1
        movdqa  32(%esi),%xmm2
        movdqa  48(%esi),%xmm3
        movdqa  64(%esi),%xmm4
        movdqa  80(%esi),%xmm5
        movdqa  96(%esi),%xmm6
        movdqa  112(%esi),%xmm7
        /*prefetchnta 128(%esi) 3dNOW */
        /*
         * movdqa or movntdq can be used.
         */
        movdqa  %xmm1,16(%edi)
        movdqa  %xmm2,32(%edi)
        movdqa  %xmm3,48(%edi)
        movdqa  %xmm4,64(%edi)
        movdqa  %xmm5,80(%edi)
        movdqa  %xmm6,96(%edi)
        movdqa  %xmm7,112(%edi)
        /*prefetchnta 128(%esi) 3dNOW */
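/*
 * The non-temporal alternative mentioned above the store sequence:
 * movntdq stores bypass the cache, which can help for copies larger
 * than the cache, but an sfence is then needed once the loop
 * completes.  Illustrative sketch only.
 */
#if 0
        movntdq %xmm1,16(%edi)
        movntdq %xmm2,32(%edi)
        /* ... and so on through %xmm7 ... */
        sfence                          /* order the non-temporal stores */
#endif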
/*
 * MMX_BCOPY() - BACKWARDS COPY
 *
 * Don't bother using xmm optimizations, just stick with mmx.
 */
        /*prefetchnta -128(%esi)*/