/*P:700 The pagetable code, on the other hand, still shows the scars of
 * previous encounters.  It's functional, and as neat as it can be in the
 * circumstances, but be wary, for these things are subtle and break easily.
 * The Guest provides a virtual to physical mapping, but we can neither trust
 * it nor use it: we verify and convert it here to point the hardware to the
 * actual Guest pages when running the Guest. :*/

/* Copyright (C) Rusty Russell IBM Corporation 2006.
 * GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include "lg.h"
#define PTES_PER_PAGE_SHIFT 10
#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
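
/* This is the standard two-level x86 layout: each of the 1024 top-level
 * (PGD) entries covers 4MB and points to a page of 1024 PTEs mapping 4k
 * each.  We reserve the topmost PGD slot, ie. the top 4MB of virtual
 * address space, for the Switcher: the Guest must never touch it. */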

static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)

static unsigned vaddr_to_pgd_index(unsigned long vaddr)
{
	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
}
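
/* For example, with PAGE_SHIFT == 12 this shifts by 22 bits: the usual
 * 3GB kernel boundary 0xC0000000 >> 22 gives PGD index 768. */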

/* These access the shadow versions (ie. the ones used by the CPU). */
static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{
	unsigned int index = vaddr_to_pgd_index(vaddr);

	if (index >= SWITCHER_PGD_INDEX) {
		kill_guest(lg, "attempt to access switcher pages");
		/* kill_guest() doesn't stop us here, so hand back a safe
		 * entry rather than a pointer into the Switcher slot. */
		index = 0;
	}
	return &lg->pgdirs[i].pgdir[index];
}

static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr)
{
	spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
	BUG_ON(!(spgd.flags & _PAGE_PRESENT));
	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
}

/* These access the guest versions. */
static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
{
	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
}

static unsigned long gpte_addr(struct lguest *lg,
			       gpgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
	BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
	return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
}

/* Do a virtual -> physical mapping on a user page. */
static unsigned long get_pfn(unsigned long virtpfn, int write)
{
	struct page *page;
	unsigned long ret = -1UL;

	down_read(&current->mm->mmap_sem);
	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
			   1, write, 1, &page, NULL) == 1)
		ret = page_to_pfn(page);
	up_read(&current->mm->mmap_sem);
	return ret;
}
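
/* Note: on success, get_user_pages() leaves us holding a reference to the
 * page; release_pte() below is what drops it.  This is the pinning the
 * FIXME further down complains about. */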

static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
{
	spte_t spte;
	unsigned long pfn;

	/* We ignore the global flag. */
	spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
	pfn = get_pfn(gpte.pfn, write);
	if (pfn == -1UL) {
		kill_guest(lg, "failed to get page %u", gpte.pfn);
		/* Must not put_page() bogus page on cleanup. */
		spte.flags = 0;
	}
	spte.pfn = pfn;
	return spte;
}

static void release_pte(spte_t pte)
{
	if (pte.flags & _PAGE_PRESENT)
		put_page(pfn_to_page(pte.pfn));
}
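
/* These two checks are the "verify" half of the promise in the header
 * comment: reject any Guest entry with flags we won't emulate, or a frame
 * number beyond the memory we granted the Guest. */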
static void check_gpte(struct lguest *lg, gpte_t gpte)
{
	if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit)
		kill_guest(lg, "bad page table entry");
}

static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
{
	if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit)
		kill_guest(lg, "bad page directory entry");
}

/* FIXME: We hold reference to pages, which prevents them from being
 * swapped.  It'd be nice to have a callback when Linux wants to swap out. */

/* We fault pages in, which allows us to update accessed/dirty bits.
 * Return true if we got page. */
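/* The errcode here is the x86 page fault error code: bit 0 is set if the
 * fault was on a present page, bit 1 if it was a write, bit 2 if the Guest
 * was in usermode.  That's why we test (errcode & 2) and (errcode & 4)
 * below. */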
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{
	gpgd_t gpgd;
	spgd_t *spgd;
	unsigned long gpte_ptr;
	gpte_t gpte;
	spte_t *spte;

	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
	if (!(gpgd.flags & _PAGE_PRESENT))
		return 0;

	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(spgd->flags & _PAGE_PRESENT)) {
		/* Get a page of PTEs for them. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/* FIXME: Steal from self in this case? */
		if (!ptepage) {
			kill_guest(lg, "out of memory allocating pte page");
			return 0;
		}
		check_gpgd(lg, gpgd);
		spgd->raw.val = (__pa(ptepage) | gpgd.flags);
	}

	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
	gpte = mkgpte(lgread_u32(lg, gpte_ptr));

	if (!(gpte.flags & _PAGE_PRESENT))
		return 0;

	/* Write to read-only page? */
	if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
		return 0;

	/* User access to a non-user page? */
	if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
		return 0;

	check_gpte(lg, gpte);
	gpte.flags |= _PAGE_ACCESSED;
	if (errcode & 2)
		gpte.flags |= _PAGE_DIRTY;

	/* We're done with the old pte. */
	spte = spte_addr(lg, *spgd, vaddr);
	release_pte(*spte);

	/* We don't make it writable if this isn't a write: later
	 * write will fault so we can set dirty bit in guest. */
	if (gpte.flags & _PAGE_DIRTY)
		*spte = gpte_to_spte(lg, gpte, 1);
	else {
		gpte_t ro_gpte = gpte;
		ro_gpte.flags &= ~_PAGE_RW;
		*spte = gpte_to_spte(lg, ro_gpte, 0);
	}

	/* Now we update dirty/accessed on guest. */
	lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
	return 1;
}

/* This is much faster than the full demand_page logic. */
static int page_writable(struct lguest *lg, unsigned long vaddr)
{
	spgd_t *spgd;
	unsigned long flags;

	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
	if (!(spgd->flags & _PAGE_PRESENT))
		return 0;

	flags = spte_addr(lg, *spgd, vaddr)->flags;
	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
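
/* It's faster because it only peeks at our shadow tables: no lgread of
 * Guest memory, no validation, no page pinning.  If the shadow entry isn't
 * already present and writable, pin_page() falls back to demand_page(). */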
void pin_page(struct lguest *lg, unsigned long vaddr)
{
	if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
		kill_guest(lg, "bad stack page %#lx", vaddr);
}

static void release_pgd(struct lguest *lg, spgd_t *spgd)
{
	if (spgd->flags & _PAGE_PRESENT) {
		unsigned int i;
		spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
		for (i = 0; i < PTES_PER_PAGE; i++)
			release_pte(ptepage[i]);
		free_page((long)ptepage);
		/* Clear the entry so we don't free this page twice. */
		spgd->raw.val = 0;
	}
}

static void flush_user_mappings(struct lguest *lg, int idx)
{
	unsigned int i;
	/* Release every pgd entry below the Guest's kernel boundary. */
	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
		release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}

void guest_pagetable_flush_user(struct lguest *lg)
{
	flush_user_mappings(lg, lg->pgdidx);
}

static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].cr3 == pgtable)
			break;
	return i;
}
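
/* Note that find_pgdir() returns ARRAY_SIZE(lg->pgdirs) when there's no
 * match: callers compare against that to detect a miss. */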

static unsigned int new_pgdir(struct lguest *lg,
			      unsigned long cr3,
			      int *blank_pgdir)
{
	unsigned int next;

	/* Evict a random slot: a deliberately cheap policy for such a
	 * small shadow cache. */
	next = random32() % ARRAY_SIZE(lg->pgdirs);
	if (!lg->pgdirs[next].pgdir) {
		lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If allocation fails, just keep using the current pgdir. */
		if (!lg->pgdirs[next].pgdir)
			next = lg->pgdidx;
		else
			/* There are no mappings: you'll need to re-pin */
			*blank_pgdir = 1;
	}
	lg->pgdirs[next].cr3 = cr3;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(lg, next);

	return next;
}
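
/* Every Guest cr3 change lands here: we look the new pagetable up in our
 * shadow cache, or start shadowing a fresh one. */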
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	newpgdir = find_pgdir(lg, pgtable);
	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
		newpgdir = new_pgdir(lg, pgtable, &repin);
	lg->pgdidx = newpgdir;
	/* A brand-new pgdir has no stack mappings yet, so pin them again. */
	if (repin)
		pin_stack_pages(lg);
}

static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir)
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg, lg->pgdirs[i].pgdir + j);
}

void guest_pagetable_clear_all(struct lguest *lg)
{
	release_all_pagetables(lg);
	/* We need the Guest's kernel stack mapped before it runs again. */
	pin_stack_pages(lg);
}
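
/* The Guest told us one of its ptes changed.  We only bother updating
 * shadow entries that are actually installed: anything we lack here will
 * simply demand-fault back in with the new contents. */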
static void do_set_pte(struct lguest *lg, int idx,
		       unsigned long vaddr, gpte_t gpte)
{
	spgd_t *spgd = spgd_addr(lg, idx, vaddr);
	if (spgd->flags & _PAGE_PRESENT) {
		spte_t *spte = spte_addr(lg, *spgd, vaddr);
		release_pte(*spte);
		if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
			check_gpte(lg, gpte);
			*spte = gpte_to_spte(lg, gpte,
					     gpte.flags & _PAGE_DIRTY);
		} else
			spte->raw.val = 0;
	}
}

void guest_set_pte(struct lguest *lg,
		   unsigned long cr3, unsigned long vaddr, gpte_t gpte)
{
	/* Kernel mappings must be changed on all top levels. */
	if (vaddr >= lg->page_offset) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
			if (lg->pgdirs[i].pgdir)
				do_set_pte(lg, i, vaddr, gpte);
	} else {
		/* A user mapping only affects one pgdir. */
		int pgdir = find_pgdir(lg, cr3);
		if (pgdir != ARRAY_SIZE(lg->pgdirs))
			do_set_pte(lg, pgdir, vaddr, gpte);
	}
}

void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
{
	int pgdir;

	/* The Guest must never touch the Switcher's slot. */
	if (idx >= SWITCHER_PGD_INDEX)
		return;

	pgdir = find_pgdir(lg, cr3);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
}

int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
	/* We assume this in flush_user_mappings, so check now */
	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
		return -EINVAL;

	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
	lg->pgdirs[lg->pgdidx].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[lg->pgdidx].pgdir)
		return -ENOMEM;
	return 0;
}

void free_guest_pagetable(struct lguest *lg)
{
	unsigned int i;

	release_all_pagetables(lg);
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		free_page((long)lg->pgdirs[i].pgdir);
}

/* Caller must be preempt-safe */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
	spte_t regs_pte;
	spgd_t switcher_pgd;
	spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);

	/* Since the Switcher is less than 4MB, we simply mug the top pte
	 * page. */
	switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
	switcher_pgd.flags = _PAGE_KERNEL;
	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

	/* Map our regs page over stack page: "pages" lives in the top 4MB,
	 * so pages/PAGE_SIZE % PTES_PER_PAGE is its index within that last
	 * pte page. */
	regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
	regs_pte.flags = _PAGE_KERNEL;
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
		= regs_pte;
}

static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}

static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	spte_t *pte = switcher_pte_page(cpu);

	/* The first "pages" entries are the Switcher code itself, shared
	 * read-only by every cpu. */
	for (i = 0; i < pages; i++) {
		pte[i].pfn = page_to_pfn(switcher_page[i]);
		pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
	}

	/* We only map this CPU's pages, so guest can't see others. */
	i = pages + cpu*2;

	/* First page (regs) is rw, second (state) is ro. */
	pte[i].pfn = page_to_pfn(switcher_page[i]);
	pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW;
	pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
	pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED;
}

__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}

void free_pagetables(void)
{
	free_switcher_pte_pages();
}