1 ;; -----------------------------------------------------------------------
3 ;; Copyright 1994-2008 H. Peter Anvin - All Rights Reserved
5 ;; This program is free software; you can redistribute it and/or modify
6 ;; it under the terms of the GNU General Public License as published by
7 ;; the Free Software Foundation, Inc., 53 Temple Place Ste 330,
8 ;; Boston MA 02111-1307, USA; either version 2 of the License, or
9 ;; (at your option) any later version; incorporated herein by reference.
11 ;; -----------------------------------------------------------------------
16 ;; 32-bit bcopy routine for real mode
20 ; 32-bit bcopy routine for real mode
22 ; We enter protected mode, set up a flat 32-bit environment, run rep movsd
23 ; and then exit. IMPORTANT: This code assumes cs == 0.
25 ; This code is probably excessively anal-retentive in its handling of
26 ; segments, but this stuff is painful enough as it is without having to rely
27 ; on everything happening "as it ought to."
29 ; NOTE: this code is relocated into low memory, just after the .earlybss
30 ; segment, in order to support to "bcopy over self" operation.
37 ; This is in the .text segment since it needs to be
38 ; contiguous with the rest of the bcopy stuff
40 ; GDT descriptor entry
43 PM_%1 equ bcopy_gdt.%1-bcopy_gdt
47 dw bcopy_gdt_size-1 ; Null descriptor - contains GDT
48 dd bcopy_gdt ; pointer for LGDT instruction
52 dd 0000ffffh ; 08h Code segment, use16, readable,
53 dd 00009b00h ; present, dpl 0, cover 64K
55 dd 0000ffffh ; 10h Data segment, use16, read/write,
56 dd 008f9300h ; present, dpl 0, cover all 4G
58 dd 0000ffffh ; 18h Data segment, use16, read/write,
59 dd 00009300h ; present, dpl 0, cover 64K
60 ; The next two segments are used for COM32 only
62 dd 0000ffffh ; 20h Code segment, use32, readable,
63 dd 00cf9b00h ; present, dpl 0, cover all 4G
65 dd 0000ffffh ; 28h Data segment, use32, read/write,
66 dd 00cf9300h ; present, dpl 0, cover all 4G
68 ; TSS segment to keep Intel VT happy. Intel VT is
69 ; unhappy about anything that doesn't smell like a
70 ; full-blown 32-bit OS.
72 dw 104-1, DummyTSS ; 30h 32-bit task state segment
73 dd 00008900h ; present, dpl 0, 104 bytes @DummyTSS
75 ; 16-bit stack segment, which may have a different
76 ; base from DS16 (e.g. if we're booted from PXELINUX)
78 dd 0000ffffh ; 38h Data segment, use16, read/write,
79 dd 00009300h ; present, dpl 0, cover 64K
81 bcopy_gdt_size: equ $-bcopy_gdt
85 ; 32-bit copy, overlap safe
88 ; ESI - source pointer (-1 means do bzero rather than bcopy)
89 ; EDI - target pointer
94 ; ESI - first byte after source (garbage if ESI == -1 on entry)
95 ; EDI - first byte after target
107 ; This routine is used to invoke a simple routine in 16-bit protected
108 ; mode (with 32-bit DS and ES, and working 16-bit stack.)
109 ; Note that all segment registers including CS, except possibly SS,
110 ; are zero-based in the protected-mode routine.
112 ; No interrupt thunking services are provided; interrupts are disabled
113 ; for the duration of the routine. Don't run for too long at a time.
116 ; On stack - pm entrypoint
117 ; EAX, EBP preserved until real-mode exit
118 ; EBX, ECX, EDX, ESI and EDI passed to the called routine
121 ; EAX, EBP restored from real-mode entry
122 ; All other registers as returned from called function
123 ; PM entrypoint cleaned off stack
129 pushfd ; Saves, among others, the IF flag
138 mov byte [cs:bcopy_gdt.TSS+5],89h ; Mark TSS unbusy
140 ; Convert the stack segment to a base
145 mov [cs:bcopy_gdt.SS16+2],eax
147 push ss ; Save real-mode SS selector
149 o32 lgdt [cs:bcopy_gdt]
152 mov cr0,eax ; Enter protected mode
155 mov ax,PM_SS16 ; Make stack usable
158 mov al,PM_DS16_4G ; Data segment selector
162 ; Set fs, gs, tr, and ldtr in case we're on a virtual
163 ; machine running on Intel VT hardware -- it can't
164 ; deal with a partial transition, for no good reason.
166 mov al,PM_DS16_RM ; Real-mode-like segment
169 mov al,PM_TSS ; Intel VT really doesn't want
170 ltr ax ; an invalid TR and LDTR, so give
171 xor ax,ax ; it something that it can use...
174 call [bp+2*4+2] ; Call actual routine
177 mov ax,PM_DS16_RM ; "Real-mode-like" data segment
181 pop bp ; Previous value for ss
185 mov cr0,eax ; Disable protected mode
188 .in_rm: ; Back in real mode
198 popfd ; Re-enables interrupts
201 ret 2 ; Drops the pm entry
206 ; This is the protected-mode core of the "bcopy" routine.
207 ; Try to do aligned transfers; if the src and dst are relatively
208 ; misaligned, align the dst.
210 ; ECX is guaranteed to not be zero on entry.
216 cmp esi,edi ; If source < destination, we might
217 jb .reverse ; have to copy backwards
238 mov al,cl ; Save low bits
239 shr ecx,2 ; Convert to dwords
240 a32 rep movsd ; Do our business
241 ; At this point ecx == 0
257 lea esi,[esi+ecx-1] ; Point to final byte
282 mov al,cl ; Save low bits
325 mov bl,cl ; Save low bits
341 ; Routines to enable and disable (yuck) A20. These routines are gathered
342 ; from tips from a couple of sources, including the Linux kernel and
343 ; http://www.x86.org/. The need for the delay to be as large as given here
344 ; is indicated by Donnie Barnes of RedHat, the problematic system being an
345 ; IBM ThinkPad 760EL.
347 ; We typically toggle A20 twice for every 64K transferred.
349 %define io_delay call _io_delay
350 %define IO_DELAY_PORT 80h ; Invalid port (we hope!)
351 %define disable_wait 32 ; How long to wait for a disable
353 ; Note the skip of 2 here
354 %define A20_DUNNO 0 ; A20 type unknown
355 %define A20_NONE 2 ; A20 always on?
356 %define A20_BIOS 4 ; A20 BIOS enable
357 %define A20_KBC 6 ; A20 through KBC
358 %define A20_FAST 8 ; A20 through port 92h
360 slow_out: out dx, al ; Fall through
362 _io_delay: out IO_DELAY_PORT,al
368 mov byte [cs:A20Tries],255 ; Times to try to make this work
379 ; If the A20 type is known, jump straight to type
382 jmp word [cs:bp+A20List]
385 ; First, see if we are on a system with no A20 gate
389 mov byte [cs:A20Type], A20_NONE
394 ; Next, try the BIOS (INT 15h AX=2401h)
397 mov byte [cs:A20Type], A20_BIOS
399 pushf ; Some BIOSes muck with IF
407 ; Enable the keyboard controller A20 gate
410 mov dl, 1 ; Allow early exit
412 jnz a20_done ; A20 live, no need to use KBC
414 mov byte [cs:A20Type], A20_KBC ; Starting KBC command sequence
416 mov al,0D1h ; Command write
418 call empty_8042_uncond
422 call empty_8042_uncond
424 ; Verify that A20 actually is enabled. Do that by
425 ; observing a word in low memory and the same word in
426 ; the HMA until they are no longer coherent. Note that
427 ; we don't do the same check in the disable case, because
428 ; we don't want to *require* A20 masking (SYSLINUX should
429 ; work fine without it, if the BIOS does.)
439 ; Running out of options here. Final attempt: enable the "fast A20 gate"
442 mov byte [cs:A20Type], A20_FAST ; Haven't used the KBC yet
445 and al,~01h ; Don't accidentally reset the machine!
458 ; Oh bugger. A20 is not responding. Try frobbing it again; eventually give up
459 ; and report failure to the user.
463 dec byte [cs:A20Tries]
470 err_a20 db CR, LF, 'A20 gate not responding!', CR, LF, 0
474 ; A20 unmasked, proceed...
481 ; This routine tests if A20 is enabled (ZF = 0). This routine
482 ; must not destroy any register contents.
488 mov cx,0FFFFh ; HMA = segment 0FFFFh
490 mov cx,32 ; Loop count
494 io_delay ; Serialize, and fix delay
495 cmp ax,[es:A20Test+10h]
514 jmp word [cs:bp+A20DList]
518 pushf ; Some BIOSes muck with IF
521 jmp short a20d_snooze
524 ; Disable the "fast A20 gate"
530 jmp short a20d_snooze
533 ; Disable the keyboard controller A20 gate
536 call empty_8042_uncond
538 out 064h, al ; Command write
539 call empty_8042_uncond
540 mov al,0DDh ; A20 off
542 call empty_8042_uncond
543 ; Wait a bit for it to take effect
547 .delayloop: call a20_test
559 ; Routine to empty the 8042 KBC controller. If dl != 0
560 ; then we will test A20 in the loop and exit if A20 is
571 in al, 064h ; Status port
575 in al, 060h ; Read input
584 ; Execute a WBINVD instruction if possible on this CPU
595 ; This routine is used to shuffle memory around, followed by
596 ; invoking an entry point somewhere in low memory. This routine
597 ; can clobber any memory above 7C00h, we therefore have to move
598 ; necessary code into the trackbuf area before doing the copy,
599 ; and do adjustments to anything except BSS area references.
601 ; NOTE: Since PXELINUX relocates itself, put all these
602 ; references in the ".earlybss" segment.
604 ; After performing the copy, this routine resets the stack and
605 ; jumps to the specified entrypoint.
607 ; IMPORTANT: This routine does not canonicalize the stack or the
608 ; SS register. That is the responsibility of the caller.
611 ; DS:BX -> Pointer to list of (dst, src, len) pairs(*)
612 ; AX -> Number of list entries
613 ; [CS:EntryPoint] -> CS:IP to jump to
614 ; On stack - initial state (fd, ad, ds, es, fs, gs)
616 ; (*) If dst == -1, then (src, len) entry refers to a set of new
617 ; descriptors to load.
618 ; If src == -1, then the memory pointed to by (dst, len) is bzeroed;
619 ; this is handled inside the bcopy routine.
643 jmp far [cs:EntryPoint]
646 mov bx, trackbuf ; Next descriptor
648 push ecx ; Save byte count
653 div ecx ; Convert to descriptor count
659 ; This routine is chained to from shuffle_and_boot to invoke a
660 ; flat 32-bit protected mode operating system.
665 mov byte [cs:bcopy_gdt.TSS+5],89h ; Mark TSS unbusy
666 o32 lgdt [cs:bcopy_gdt]
669 mov cr0,eax ; Enter protected mode
670 jmp PM_CS32:.next ; Synchronize and go to 32-bit mode
674 lldt ax ; TR <- 0 to be nice to Intel VT
676 ltr ax ; Bogus TSS to be nice to Intel VT
678 mov es,ax ; 32-bit data segment selector
683 jmp word TrampolineBuf
687 A20List dw a20_dunno, a20_none, a20_bios, a20_kbc, a20_fast
689 A20DList dw a20d_dunno, a20d_none, a20d_bios, a20d_kbc, a20d_fast
692 A20Type dw A20_NONE ; A20 type
694 ; Total size of .bcopy32 section
695 alignb 4, db 0 ; Even number of dwords
696 __bcopy_size equ $-__bcopy_start
700 EntryPoint resd 1 ; CS:IP for shuffle_and_boot
701 A20Test resw 1 ; Counter for testing status of A20
702 A20Tries resb 1 ; Times until giving up on A20
705 ; This buffer contains synthesized code for shuffle-and-boot.
706 ; For the PM case, it is 9*5 = 45 bytes long; for the RM case it is
707 ; 8*6 to set the GPRs, 6*5 to set the segment registers (including a dummy
708 ; setting of CS), 5 bytes to set CS:IP, for a total of 83 bytes.
710 TrampolineBuf resb 83 ; Shuffle and boot trampoline
713 ; Space for a dummy task state segment. It should never be actually
714 ; accessed, but just in case it is, point to a chunk of memory not used