/*
 * Copyright (c) 1997, 1998 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
 */
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>
static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");
#define ZONE_ERROR_INVALID	0
#define ZONE_ERROR_NOTFREE	1
#define ZONE_ERROR_ALREADYFREE	2

#define ZONE_ROUNDING	32

#define ZENTRY_FREE	0x12342378
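/*
 * A free item carries its free-list state in its own storage: word [0]
 * of the item holds the pointer to the next free item and word [1]
 * holds the ZENTRY_FREE magic number.  The INVARIANTS checks below use
 * the magic to catch double frees and allocations of corrupted entries.
 */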
long zone_burst = 128;
static void *zget(vm_zone_t z);
/*
 * Return an item from the specified zone.   This function is non-blocking for
 * ZONE_INTERRUPT zones.
 *
 * No requirements.
 */
void *
zalloc(vm_zone_t z)
{
	globaldata_t gd = mycpu;
	vm_zpcpu_t *zpcpu;
	void *item;
	long n;

#ifdef INVARIANTS
	if (z == NULL)
		zerror(ZONE_ERROR_INVALID);
#endif
	zpcpu = &z->zpcpu[gd->gd_cpuid];
retry:
	/*
	 * Avoid spinlock contention by allocating from a per-cpu queue
	 */
	if (zpcpu->zfreecnt > 0) {
		crit_enter_gd(gd);
		if (zpcpu->zfreecnt > 0) {
			item = zpcpu->zitems;
#ifdef INVARIANTS
			KASSERT(item != NULL,
				("zitems_pcpu unexpectedly NULL"));
			if (((void **)item)[1] != (void *)ZENTRY_FREE)
				zerror(ZONE_ERROR_NOTFREE);
			((void **)item)[1] = NULL;
#endif
			zpcpu->zitems = ((void **) item)[0];
			--zpcpu->zfreecnt;
			++zpcpu->znalloc;
			crit_exit_gd(gd);
			return item;
		}
		crit_exit_gd(gd);
	}

	/*
	 * Per-zone spinlock for the remainder.  Always load at least one
	 * item.
	 */
	spin_lock(&z->zlock);
	if (z->zfreecnt > z->zfreemin) {
		n = zone_burst;
		do {
			item = z->zitems;
#ifdef INVARIANTS
			KASSERT(item != NULL, ("zitems unexpectedly NULL"));
			if (((void **)item)[1] != (void *)ZENTRY_FREE)
				zerror(ZONE_ERROR_NOTFREE);
#endif
			z->zitems = ((void **)item)[0];
			--z->zfreecnt;
			((void **)item)[0] = zpcpu->zitems;
			zpcpu->zitems = item;
			++zpcpu->zfreecnt;
		} while (--n > 0 && z->zfreecnt > z->zfreemin);
		spin_unlock(&z->zlock);
		goto retry;
	} else {
		spin_unlock(&z->zlock);
		item = zget(z);
		/*
		 * PANICFAIL allows the caller to assume that the zalloc()
		 * will always succeed.  If it doesn't, we panic here.
		 */
		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
			panic("zalloc(%s) failed", z->zname);
	}
	return item;
}
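/*
 * Caller-side sketch (hypothetical zone name, illustrative only):
 *
 *	item = zalloc(foo_zone);
 *	if (item == NULL)		needed unless the zone was
 *		return (ENOMEM);	created with ZONE_PANICFAIL
 */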
/*
 * Free an item to the specified zone.
 *
 * No requirements.
 */
void
zfree(vm_zone_t z, void *item)
{
	globaldata_t gd = mycpu;
	vm_zpcpu_t *zpcpu;
	void *tail_item;
	long count;
	long zmax;

	zpcpu = &z->zpcpu[gd->gd_cpuid];

	/*
	 * Avoid spinlock contention by freeing into a per-cpu queue
	 */
	zmax = z->zmax_pcpu;

	/*
	 * Add the item to the pcpu cache
	 */
	crit_enter_gd(gd);
	((void **)item)[0] = zpcpu->zitems;
#ifdef INVARIANTS
	if (((void **)item)[1] == (void *)ZENTRY_FREE)
		zerror(ZONE_ERROR_ALREADYFREE);
	((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
	zpcpu->zitems = item;
	++zpcpu->zfreecnt;

	if (zpcpu->zfreecnt < zmax) {
		crit_exit_gd(gd);
		return;
	}

	/*
	 * Hysteresis, move (zmax) (calculated below) items to the pool.
	 */
	zmax = zmax / 2;
	if (zmax > zone_burst)
		zmax = zone_burst;
	tail_item = zpcpu->zitems;
	count = 1;
	while (count < zmax) {
		tail_item = ((void **)tail_item)[0];
		++count;
	}
	zpcpu->zitems = ((void **)tail_item)[0];
	zpcpu->zfreecnt -= count;

	/*
	 * Per-zone spinlock for the remainder.
	 *
	 * Also implement hysteresis by freeing a number of pcpu
	 * entries at once.
	 */
	spin_lock(&z->zlock);
	((void **)tail_item)[0] = z->zitems;
	z->zitems = tail_item;
	z->zfreecnt += count;
	spin_unlock(&z->zlock);

	crit_exit_gd(gd);
}
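/*
 * Worked example of the hysteresis above, with illustrative numbers:
 * for a zone with zmax_pcpu = 2048, a pcpu cache reaching 2048 free
 * items moves min(2048/2, zone_burst) = 128 items (at the default
 * zone_burst of 128) to the global pool in a single burst, so the
 * per-zone spinlock is taken roughly once per 128 frees instead of
 * once per free.
 */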
/*
 * This file comprises a very simple zone allocator.  This is used
 * in lieu of the malloc allocator, where needed or more optimal.
 *
 * Note that the initial implementation of this had coloring, and
 * absolutely no improvement (actually perf degradation) occurred.
 *
 * Note also that the zones are type stable.  The only restriction is
 * that the first two longwords of a data structure can be changed
 * between allocations.  Any data that must be stable between allocations
 * must reside in areas after the first two longwords.
 *
 * zinitna, zinit, and zbootinit are the initialization routines.
 * zalloc and zfree are the allocation/free routines.
 */
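/*
 * Illustrative layout of a zone-allocated structure (hypothetical, not
 * from this file), showing which fields the type-stability rule covers:
 *
 *	struct foo {
 *		struct foo *next;	first longword: clobbered by the
 *					free-list link while the item is free
 *		long	    magic;	second longword: clobbered by the
 *					ZENTRY_FREE magic under INVARIANTS
 *		int	    state;	beyond the first two longwords:
 *					survives zfree()/zalloc() cycles
 *	};
 */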
LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);
static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static vm_pindex_t zone_kmem_pages, zone_kern_pages;
static long zone_kmem_kvaspace;
/*
 * Create a zone, but don't allocate the zone structure.  If the
 * zone had been previously created by the zone boot code, initialize
 * various parts of the zone code.
 *
 * If waits are not allowed during allocation (e.g. during interrupt
 * code), a-priori allocate the kernel virtual space, and allocate
 * only pages when needed.
 *
 * Arguments:
 * z		pointer to zone structure.
 * obj		pointer to VM object (opt).
 * name		name of zone.
 * size		size of zone entries.
 * nentries	number of zone entries allocated (only for ZONE_INTERRUPT).
 * flags	ZONE_INTERRUPT -- items can be allocated at interrupt time.
 * zalloc	number of pages allocated when memory is needed.
 *
 * Note that when using ZONE_INTERRUPT, the size of the zone is limited
 * by the nentries argument.  The size of the memory allocatable is
 * unlimited if ZONE_INTERRUPT is not set.
 *
 * No requirements.
 */
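/*
 * Hypothetical ZONE_INTERRUPT setup, along the lines of the pvzone
 * mentioned below (names and sizing illustrative only):
 *
 *	zinitna(&pvzone, &pvzone_obj, "PV ENTRY", sizeof(struct pv_entry),
 *		vm_page_array_size, ZONE_INTERRUPT);
 *
 * The zone is then hard-limited to nentries items, but its KVA is
 * reserved up front so zalloc() on it is safe at interrupt time.
 */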
int
zinitna(vm_zone_t z, vm_object_t obj, char *name, size_t size,
	long nentries, uint32_t flags)
{
	size_t totsize;

	/*
	 * Only zones created with zinit() are destroyable.
	 */
	if (z->zflags & ZONE_DESTROYABLE)
		panic("zinitna: can't create destroyable zone");

	/*
	 * NOTE: We can only adjust zsize if we previously did not
	 *	 use zbootinit().
	 */
	if ((z->zflags & ZONE_BOOT) == 0) {
		z->zsize = roundup2(size, ZONE_ROUNDING);
		spin_init(&z->zlock, "zinitna");
		z->zfreecnt = 0;
		z->ztotal = 0;
		z->zmax = 0;
		z->zname = name;
		z->znalloc = 0;
		z->zitems = NULL;

		lwkt_gettoken(&vm_token);
		LIST_INSERT_HEAD(&zlist, z, zlink);
		lwkt_reltoken(&vm_token);

		bzero(z->zpcpu, sizeof(z->zpcpu));
	}

	z->zkmvec = NULL;
	z->zkmcur = z->zkmmax = 0;
	z->zflags |= flags;

	/*
	 * If we cannot wait, allocate KVA space up front, and we will fill
	 * in pages as needed.  This is particularly required when creating
	 * an allocation space for map entries in kernel_map, because we
	 * do not want to go into a recursion deadlock with
	 * vm_map_entry_reserve().
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		totsize = round_page((size_t)z->zsize * nentries);
		atomic_add_long(&zone_kmem_kvaspace, totsize);

		z->zkva = kmem_alloc_pageable(&kernel_map, totsize,
					      VM_SUBSYS_ZALLOC);
		if (z->zkva == 0) {
			LIST_REMOVE(z, zlink);
			return 0;
		}

		z->zpagemax = totsize / PAGE_SIZE;
		if (obj == NULL) {
			z->zobj = vm_object_allocate(OBJT_DEFAULT,
						     z->zpagemax);
		} else {
			z->zobj = obj;
			_vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
		}
		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT |
				VM_ALLOC_NORMAL | VM_ALLOC_RETRY;
		z->zmax = nentries;
		z->zmax_pcpu = z->zmax / ncpus / 16;

		/*
		 * Set reasonable pcpu cache bounds.  Low-memory systems
		 * might try to cache too little, large-memory systems
		 * might try to cache more than necessary.
		 *
		 * In particular, pvzone can wind up being excessive and
		 * waste memory unnecessarily.
		 */
		if (z->zmax_pcpu < 1024)
			z->zmax_pcpu = 1024;
		if (z->zmax_pcpu * z->zsize > 16*1024*1024)
			z->zmax_pcpu = 16*1024*1024 / z->zsize;
	} else {
		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
		z->zmax = 0;
		z->zmax_pcpu = 8192;
	}
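	/*
	 * Worked example of the bounds above (illustrative numbers): with
	 * zmax = 1000000 entries and ncpus = 8, the initial zmax_pcpu is
	 * 1000000/8/16 = 7812 items per cpu.  The 16MB cap matters for
	 * large entries: with zsize = 8192 it limits zmax_pcpu to
	 * 16*1024*1024/8192 = 2048 cached items per cpu.
	 */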
	if (z->zsize > PAGE_SIZE)
		z->zfreemin = 1;
	else
		z->zfreemin = PAGE_SIZE / z->zsize;

	z->zpagecount = 0;

	/*
	 * Reduce kernel_map spam by allocating in chunks of 4 pages.
	 */
	z->zalloc = 4;

	/*
	 * Populate the interrupt zone at creation time rather than
	 * on first allocation, as this is a potentially long operation.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		void *buf;

		buf = zget(z);
		if (buf)
			zfree(z, buf);
	}

	return 1;
}
/*
 * Subroutine same as zinitna(), except the zone data structure is allocated
 * automatically by malloc.  This routine should normally be used, except
 * in certain tricky startup conditions in the VM system -- then
 * zbootinit() and zinitna() can be used.  zinit() is the standard zone
 * initialization call.
 *
 * No requirements.
 */
vm_zone_t
zinit(char *name, size_t size, long nentries, uint32_t flags)
{
	vm_zone_t z;

	z = (vm_zone_t) kmalloc(sizeof (struct vm_zone), M_ZONE, M_NOWAIT);
	if (z == NULL)
		return NULL;

	z->zflags = 0;
	if (zinitna(z, NULL, name, size, nentries,
		    flags & ~ZONE_DESTROYABLE) == 0) {
		kfree(z, M_ZONE);
		return NULL;
	}

	if (flags & ZONE_DESTROYABLE)
		z->zflags |= ZONE_DESTROYABLE;

	return z;
}
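/*
 * Typical lifecycle sketch (hypothetical zone name, illustrative only):
 *
 *	static vm_zone_t foo_zone;
 *
 *	foo_zone = zinit("FOO", sizeof(struct foo), 0, ZONE_DESTROYABLE);
 *	if (foo_zone == NULL)
 *		return (ENOMEM);
 *	...
 *	zdestroy(foo_zone);	legal only because ZONE_DESTROYABLE was set
 */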
/*
 * Initialize a zone before the system is fully up.  This routine should
 * only be called before full VM startup.
 *
 * Called from the low level boot code only.
 */
void
zbootinit(vm_zone_t z, char *name, size_t size, void *item, long nitems)
{
	long i;

	spin_init(&z->zlock, "zbootinit");
	bzero(z->zpcpu, sizeof(z->zpcpu));

	z->zname = name;
	z->zsize = size;
	z->zpagemax = 0;
	z->zobj = NULL;
	z->zflags = ZONE_BOOT;
	z->zfreemin = 0;
	z->zallocflag = 0;
	z->zpagecount = 0;
	z->zalloc = 0;

	bzero(item, (size_t)nitems * z->zsize);
	z->zitems = NULL;
	for (i = 0; i < nitems; i++) {
		((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
		((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
		z->zitems = item;
		item = (uint8_t *)item + z->zsize;
	}
	z->zfreecnt = nitems;
	z->zmax = 0;
	z->ztotal = 0;

	lwkt_gettoken(&vm_token);
	LIST_INSERT_HEAD(&zlist, z, zlink);
	lwkt_reltoken(&vm_token);
}
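/*
 * Boot-time sketch (hypothetical names, illustrative only): zbootinit()
 * builds the initial free list from a statically allocated array, since
 * the VM system is not yet far enough along for zget() to allocate
 * backing store:
 *
 *	static struct vm_zone mapentzone_store;
 *	static struct vm_map_entry map_entry_init[MAX_MAPENT];
 *
 *	zbootinit(&mapentzone_store, "MAP ENTRY",
 *		  sizeof(struct vm_map_entry), map_entry_init, MAX_MAPENT);
 *
 * A later zinitna() call can upgrade the zone in place; the ZONE_BOOT
 * flag set here tells zinitna() not to reinitialize it.
 */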
/*
 * Release all resources owned by zone created with zinit().
 *
 * No requirements.
 */
void
zdestroy(vm_zone_t z)
{
	vm_page_t m;
	long i;

	if (z == NULL)
		panic("zdestroy: null zone");
	if ((z->zflags & ZONE_DESTROYABLE) == 0)
		panic("zdestroy: undestroyable zone");

	lwkt_gettoken(&vm_token);
	LIST_REMOVE(z, zlink);
	lwkt_reltoken(&vm_token);

	/*
	 * Release virtual mappings, physical memory and update sysctl stats.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Pages mapped via pmap_kenter() must be removed from the
		 * kernel_pmap() before calling kmem_free() to avoid issues
		 * with kernel_pmap.pm_stats.resident_count.
		 */
		pmap_qremove(z->zkva, z->zpagemax);
		vm_object_hold(z->zobj);
		for (i = 0; i < z->zpagecount; ++i) {
			m = vm_page_lookup_busy_wait(z->zobj, i, TRUE, "vmzd");
			vm_page_unwire(m, 0);
			vm_page_free(m);
		}

		/*
		 * Free the KVA mapping.
		 */
		kmem_free(&kernel_map, z->zkva,
			  (size_t)z->zpagemax * PAGE_SIZE);
		atomic_subtract_long(&zone_kmem_kvaspace,
				     (size_t)z->zpagemax * PAGE_SIZE);

		/*
		 * Free the backing object and physical pages.
		 */
		vm_object_deallocate(z->zobj);
		vm_object_drop(z->zobj);
		atomic_subtract_long(&zone_kmem_pages, z->zpagecount);
	} else {
		for (i = 0; i < z->zkmcur; i++) {
			kmem_free(&kernel_map, z->zkmvec[i],
				  (size_t)z->zalloc * PAGE_SIZE);
			atomic_subtract_long(&zone_kern_pages, z->zalloc);
		}
		if (z->zkmvec != NULL)
			kfree(z->zkmvec, M_ZONE);
	}

	spin_uninit(&z->zlock);
	kfree(z, M_ZONE);
}
/*
 * void *zalloc(vm_zone_t zone) --
 *	Returns an item from a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 *
 * void zfree(vm_zone_t zone, void *item) --
 *	Frees an item back to a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 */
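/*
 * Steady-state usage sketch (hypothetical zone name, illustrative only):
 *
 *	struct foo *fp;
 *
 *	fp = zalloc(foo_zone);		may block unless ZONE_INTERRUPT
 *	if (fp == NULL)
 *		return (NULL);
 *	bzero(fp, sizeof(*fp));		recycled items are not zeroed
 *	...
 *	zfree(foo_zone, fp);
 */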
/*
 * Internal zone routine.  Not to be called from external (non vm_zone)
 * code.  Allocates more backing store for a zone when it runs out of
 * free items.
 *
 * No requirements.
 */
static void *
zget(vm_zone_t z)
{
	vm_page_t m;
	long nitems;
	long savezpc;
	size_t nbytes;
	size_t noffset;
	void *item;
	long npages;
	long i;

	if (z == NULL)
		panic("zget: null zone");
	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Interrupt zones do not mess with the kernel_map, they
		 * simply populate an existing mapping.
		 *
		 * First reserve the required space.
		 */
		vm_object_hold(z->zobj);
		noffset = (size_t)z->zpagecount * PAGE_SIZE;
		noffset -= noffset % z->zsize;
		savezpc = z->zpagecount;
		if (z->zpagecount + z->zalloc > z->zpagemax)
			z->zpagecount = z->zpagemax;
		else
			z->zpagecount += z->zalloc;
		item = (char *)z->zkva + noffset;
		npages = z->zpagecount - savezpc;
		nitems = ((size_t)(savezpc + npages) * PAGE_SIZE - noffset) /
			 z->zsize;
		atomic_add_long(&zone_kmem_pages, npages);

		/*
		 * Now allocate the pages.  Note that we can block in the
		 * loop, so we've already done all the necessary calculations
		 * and reservations above.
		 */
		for (i = 0; i < npages; ++i) {
			vm_offset_t zkva;

			m = vm_page_alloc(z->zobj, savezpc + i, z->zallocflag);
			KKASSERT(m != NULL);
			/* note: z might be modified due to blocking */

			KKASSERT(m->queue == PQ_NONE);
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_wire(m);
			vm_page_wakeup(m);

			zkva = z->zkva + (size_t)(savezpc + i) * PAGE_SIZE;
			pmap_kenter(zkva, VM_PAGE_TO_PHYS(m));
			bzero((void *)zkva, PAGE_SIZE);
		}
		vm_object_drop(z->zobj);
	} else if (z->zflags & ZONE_SPECIAL) {
		/*
		 * The special zone is the one used for vm_map_entry_t's.
		 * We have to avoid an infinite recursion in
		 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
		 * instead.  The map entries are pre-reserved by the kernel
		 * by vm_map_entry_reserve_cpu_init().
		 */
		nbytes = (size_t)z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes,
					   VM_SUBSYS_ZALLOC, KM_KRESERVE);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			atomic_add_long(&zone_kern_pages, z->zalloc);
			bzero(item, nbytes);
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	} else {
		/*
		 * Otherwise allocate KVA from the kernel_map.
		 */
		nbytes = (size_t)z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes,
					   VM_SUBSYS_ZALLOC, 0);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			atomic_add_long(&zone_kern_pages, z->zalloc);
			bzero(item, nbytes);

			if (z->zflags & ZONE_DESTROYABLE) {
				if (z->zkmcur == z->zkmmax) {
					z->zkmmax =
					    z->zkmmax == 0 ? 1 : z->zkmmax * 2;
					z->zkmvec = krealloc(z->zkmvec,
					    z->zkmmax * sizeof(z->zkmvec[0]),
					    M_ZONE, M_WAITOK);
				}
				z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
			}
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	}
	spin_lock(&z->zlock);
	z->ztotal += nitems;

	/*
	 * Save one for immediate allocation
	 */
	if (nitems != 0) {
		nitems -= 1;
		for (i = 0; i < nitems; i++) {
			((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
			((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
			z->zitems = item;
			item = (uint8_t *)item + z->zsize;
		}
		z->zfreecnt += nitems;
		++z->znalloc;
	} else if (z->zfreecnt > 0) {
		item = z->zitems;
		z->zitems = ((void **)item)[0];
#ifdef INVARIANTS
		if (((void **)item)[1] != (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **) item)[1] = NULL;
#endif
		--z->zfreecnt;
		++z->znalloc;
	} else {
		item = NULL;
	}
	spin_unlock(&z->zlock);
	/*
	 * A special zone may have used a kernel-reserved vm_map_entry.  If
	 * so we have to be sure to recover our reserve so we don't run out.
	 * We will panic if we run out.
	 */
	if (z->zflags & ZONE_SPECIAL)
		vm_map_entry_reserve(0);

	return item;
}
/*
 * Debugging only.
 */
static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
	vm_zone_t curzone;
	char tmpbuf[128];
	char tmpname[14];
	long freecnt;
	long znalloc;
	size_t len;
	int offset;
	int error = 0;
	int i;
	int n;

	ksnprintf(tmpbuf, sizeof(tmpbuf),
		  "\nITEM SIZE LIMIT USED FREE REQUESTS\n");
	error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
	if (error)
		return error;

	lwkt_gettoken(&vm_token);
	LIST_FOREACH(curzone, &zlist, zlink) {
		len = strlen(curzone->zname);
		if (len >= (sizeof(tmpname) - 1))
			len = (sizeof(tmpname) - 1);
		for (i = 0; i < sizeof(tmpname) - 1; i++)
			tmpname[i] = ' ';
		tmpname[i] = 0;
		memcpy(tmpname, curzone->zname, len);
		tmpname[len] = ':';
		offset = 0;
		if (curzone == LIST_FIRST(&zlist)) {
			offset = 1;
			tmpbuf[0] = '\n';
		}
		freecnt = curzone->zfreecnt;
		znalloc = curzone->znalloc;
		for (n = 0; n < ncpus; ++n) {
			freecnt += curzone->zpcpu[n].zfreecnt;
			znalloc += curzone->zpcpu[n].znalloc;
		}

		ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
			"%s %6.6lu, %8.8lu, %6.6lu, %6.6lu, %8.8lu\n",
			tmpname, curzone->zsize, curzone->zmax,
			(curzone->ztotal - freecnt),
			freecnt, znalloc);

		len = strlen((char *)tmpbuf);
		if (LIST_NEXT(curzone, zlink) == NULL)
			tmpbuf[len - 1] = 0;

		error = SYSCTL_OUT(req, tmpbuf, len);
		if (error)
			break;
	}
	lwkt_reltoken(&vm_token);
	return error;
}
#if defined(INVARIANTS)

/*
 * Debugging only.
 */
void
zerror(int error)
{
	char *msg;

	switch (error) {
	case ZONE_ERROR_INVALID:
		msg = "zone: invalid zone";
		break;
	case ZONE_ERROR_NOTFREE:
		msg = "zone: entry not free";
		break;
	case ZONE_ERROR_ALREADYFREE:
		msg = "zone: freeing free entry";
		break;
	default:
		msg = "zone: invalid error";
		break;
	}
	panic("%s", msg);
}

#endif
SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, \
	   NULL, 0, sysctl_vm_zone, "A", "Zone Info");

SYSCTL_LONG(_vm, OID_AUTO, zone_kmem_pages,
	    CTLFLAG_RD, &zone_kmem_pages, 0,
	    "Number of interrupt safe pages allocated by zone");
SYSCTL_LONG(_vm, OID_AUTO, zone_burst,
	    CTLFLAG_RW, &zone_burst, 0,
	    "Burst from depot to pcpu cache");
SYSCTL_LONG(_vm, OID_AUTO, zone_kmem_kvaspace,
	    CTLFLAG_RD, &zone_kmem_kvaspace, 0,
	    "KVA space allocated by zone");
SYSCTL_LONG(_vm, OID_AUTO, zone_kern_pages,
	    CTLFLAG_RD, &zone_kern_pages, 0,
	    "Number of non-interrupt safe pages allocated by zone");