Unleashed v1.4
[unleashed.git] / kernel / vm / vpm.c
blob 01cbeeac932102f5b9c5c4b42e551f0bf3278193
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
28 * VM - generic vnode page mapping interfaces.
30 * Mechanism to provide temporary mappings to vnode pages.
31 * The typical use would be to copy/access file data.
34 #include <sys/types.h>
35 #include <sys/t_lock.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/buf.h>
39 #include <sys/systm.h>
40 #include <sys/vnode.h>
41 #include <sys/mman.h>
42 #include <sys/errno.h>
43 #include <sys/cred.h>
44 #include <sys/kmem.h>
45 #include <sys/vtrace.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/thread.h>
49 #include <sys/dumphdr.h>
50 #include <sys/bitmap.h>
51 #include <sys/lgrp.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/hat.h>
55 #include <vm/as.h>
56 #include <vm/seg.h>
57 #include <vm/seg_kpm.h>
58 #include <vm/seg_map.h>
59 #include <vm/page.h>
60 #include <vm/pvn.h>
61 #include <vm/rm.h>
62 #include <vm/vpm.h>
65 #ifdef SEGKPM_SUPPORT
67 * VPM can be disabled by setting vpm_enable = 0 in
68 * /etc/system.
71 int vpm_enable = 1;
73 #else
75 int vpm_enable = 0;
77 #endif
79 #ifdef SEGKPM_SUPPORT
82 int vpm_cache_enable = 1;
83 long vpm_cache_percent = 12;
84 long vpm_cache_size;
85 int vpm_nfreelist = 0;
86 int vpmd_freemsk = 0;
88 #define VPM_S_PAD 64
89 union vpm_cpu {
90 struct {
91 int vcpu_free_ndx;
92 ulong_t vcpu_hits;
93 ulong_t vcpu_misses;
94 } vcpu;
95 char vpm_pad[VPM_S_PAD];
97 static union vpm_cpu *vpmd_cpu;
99 #define vfree_ndx vcpu.vcpu_free_ndx
101 int vpm_cachemode = VPMCACHE_LRU;
103 #define PPMTX(pp) (&(pp)->p_ilock)
105 static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */
106 static struct vpmfree *vpmd_free;
107 #define VPMAPMTX(vpm) (&vpm->vpm_mtx)
108 #define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
109 #define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
110 #define VPMP(id) (&vpmd_vpmap[id - 1])
111 #define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1)
114 #ifdef DEBUG
116 struct vpm_debug {
117 int vpmd_steals;
118 int vpmd_contend;
119 int vpmd_prevpagelocked;
120 int vpmd_getpagefailed;
121 int vpmd_zerostart;
122 int vpmd_emptyfreelist;
123 int vpmd_nofreevpms;
124 } vpm_debug;
126 #define VPM_DEBUG(x) ((vpm_debug.x)++)
128 int steals;
129 int steals_mtbf = 7;
130 int contend;
131 int contend_mtbf = 127;
133 #define VPM_MTBF(v, f) (((++(v)) & (f)) != (f))
135 #else /* DEBUG */
137 #define VPM_MTBF(v, f) (1)
138 #define VPM_DEBUG(x) /* nothing */
140 #endif
143 * The vpm cache.
145 * The main purpose of having a cache here is to speed up page_lookup()
146 * operations and also provide an LRU(default) behaviour of file pages. The
147 * page_lookup() operation tends to be expensive if a page has to be
148 * reclaimed from the system page cache("cachelist"). Once we speed up the
149 * page_lookup()->page_reclaim() path then there should be no need for
150 * this cache. The system page cache(cachelist) should effectively serve the
151 * purpose of caching file pages.
153 * This cache is very similar to segmap's smap cache. Each page in the
154 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
155 * hash table. The page_t has a reference to the vpmap_t when cached. For a
156 * given vnode, offset the page is found by means of a page_lookup() operation.
157 * Any page which has a mapping(i.e when cached) will not be in the
158 * system 'cachelist'. Hence the page_lookup() will not have to do a
159 * page_reclaim(). That is how the cache serves to speed up page_lookup()
160 * operations.
162 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
165 void
166 vpm_init()
168 long npages;
169 struct vpmap *vpm;
170 struct vpmfree *vpmflp;
171 int i, ndx;
172 extern void prefetch_smap_w(void *);
174 if (!kpm_enable) {
175 vpm_enable = 0;
178 if (!vpm_enable || !vpm_cache_enable) {
179 return;
183 * Set the size of the cache.
185 vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
186 if (vpm_cache_size < VPMAP_MINCACHE) {
187 vpm_cache_size = VPMAP_MINCACHE;
190 if (vpm_cache_size > VPMAP_MAXCACHE) {
191 vpm_cache_size = VPMAP_MAXCACHE;
195 * Number of freelists.
197 if (vpm_nfreelist == 0) {
198 vpm_nfreelist = max_ncpus;
199 } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
200 cmn_err(CE_WARN, "vpmap create : number of freelist "
201 "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
202 vpm_nfreelist = 2 * max_ncpus;
206 * Round it up to the next power of 2
208 if (!ISP2(vpm_nfreelist)) {
209 vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
211 vpmd_freemsk = vpm_nfreelist - 1;
214 * Use a per cpu rotor index to spread the allocations evenly
215 * across the available vpm freelists.
217 vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
218 ndx = 0;
219 for (i = 0; i < max_ncpus; i++) {
221 vpmd_cpu[i].vfree_ndx = ndx;
222 ndx = (ndx + 1) & vpmd_freemsk;
226 * Allocate and initialize the freelist.
228 vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
229 KM_SLEEP);
230 for (i = 0; i < vpm_nfreelist; i++) {
232 vpmflp = &vpmd_free[i];
234 * Set up initial queue pointers. They will get flipped
235 * back and forth.
237 vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
238 vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
241 npages = mmu_btop(vpm_cache_size);
245 * Allocate and initialize the vpmap structs. We need to
246 * walk the array backwards as the prefetch happens in reverse
247 * order.
249 vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);
250 for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
251 struct vpmfree *vpmflp;
252 union vpm_freeq *releq;
253 struct vpmap *vpmapf;
256 * Use prefetch as we have to walk thru a large number of
257 * these data structures. We just use the smap's prefetch
258 * routine as it does the same.
260 prefetch_smap_w((void *)vpm);
262 vpm->vpm_vp = NULL;
263 vpm->vpm_off = 0;
264 vpm->vpm_pp = NULL;
265 vpm->vpm_refcnt = 0;
266 mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
267 vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
269 vpmflp = VPMAP2VMF(vpm);
270 releq = vpmflp->vpm_releq;
272 vpmapf = releq->vpmq_free;
273 if (vpmapf == NULL) {
274 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
275 } else {
276 vpm->vpm_next = vpmapf;
277 vpm->vpm_prev = vpmapf->vpm_prev;
278 vpmapf->vpm_prev = vpm;
279 vpm->vpm_prev->vpm_next = vpm;
280 releq->vpmq_free = vpm->vpm_next;
284 * Indicate that the vpmap is on the releq at start
286 vpm->vpm_ndxflg = VPMRELEQ;
/*
 * VPMAP_RMFREELIST(vpm)
 * Unhooks vpm from its freelist if it is still on one (vpm_next !=
 * NULL). Callers in this file invoke it while holding the vpmap mutex;
 * the freelist queue mutex is taken and dropped here.
 * Wrapped in do { } while (0) so it expands safely as a single
 * statement (e.g. inside an unbraced if/else), and the argument is
 * parenthesized for macro hygiene.
 */
#define	VPMAP_RMFREELIST(vpm) \
	do { \
		if ((vpm)->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[(vpm)->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[(vpm)->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != (vpm)) { \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} else if ((vpm) == (vpm)->vpm_next) { \
				/* Last element on the queue. */ \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = (vpm)->vpm_next; \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			(vpm)->vpm_next = (vpm)->vpm_prev = NULL; \
		} \
	} while (0)
317 static int
318 get_freelndx(int mode)
320 int ndx;
322 ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
323 switch (mode) {
325 case VPMCACHE_LRU:
326 default:
327 vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
328 break;
330 return (ndx);
335 * Find one vpmap structure from the free lists and use it for the newpage.
336 * The previous page it cached is dissociated and released. The page_t's
337 * p_vpmref is cleared only when the vpm it is pointing to is locked(or
338 * for AMD64 when the page is exclusively locked in page_unload. That is
339 * because the p_vpmref is treated as mapping).
341 * The page's p_vpmref is set when the page is
342 * locked(at least SHARED locked).
/*
 * Pick a vpmap slot from the freelists for 'newpage', dissociating and
 * releasing whatever page that slot previously cached. Returns with the
 * chosen vpmap's mutex held (see get_vpmap(), which relies on this).
 * If another thread already bound a vpmap to newpage, that vpmap is
 * validated and reused instead.
 */
344 static struct vpmap *
345 get_free_vpmap(page_t *newpage)
347 struct vpmfree *vpmflp;
348 kmutex_t *vmtx;
349 struct vpmap *vpm, *first;
350 union vpm_freeq *allocq, *releq;
351 page_t *pp = NULL;
/* NOTE(review): page_locked is reset below but no visible line sets it
 * to 1 in this excerpt — confirm against upstream before relying on it. */
352 int end_ndx, page_locked = 0;
353 int free_ndx;
356 * get the freelist bin index.
358 free_ndx = get_freelndx(vpm_cachemode);
360 end_ndx = free_ndx;
361 vpmflp = &vpmd_free[free_ndx];
/* Top of the retry protocol: re-read the (possibly flipped) alloc queue. */
363 retry_queue:
364 allocq = vpmflp->vpm_allocq;
365 mutex_enter(&allocq->vpmq_mtx);
367 if ((vpm = allocq->vpmq_free) == NULL) {
369 skip_queue:
371 * The alloc list is empty or this queue is being skipped;
372 * first see if the allocq toggled.
374 if (vpmflp->vpm_allocq != allocq) {
375 /* queue changed */
376 mutex_exit(&allocq->vpmq_mtx);
377 goto retry_queue;
379 releq = vpmflp->vpm_releq;
380 if (!mutex_tryenter(&releq->vpmq_mtx)) {
381 /* cannot get releq; a free vpmap may be there now */
382 mutex_exit(&allocq->vpmq_mtx);
385 * This loop could spin forever if this thread has
386 * higher priority than the thread that is holding
387 * releq->vpmq_mtx. In order to force the other thread
388 * to run, we'll lock/unlock the mutex which is safe
389 * since we just unlocked the allocq mutex.
391 mutex_enter(&releq->vpmq_mtx);
392 mutex_exit(&releq->vpmq_mtx);
393 goto retry_queue;
/* Both queue mutexes are now held; see whether anything was released. */
395 if (releq->vpmq_free == NULL) {
396 VPM_DEBUG(vpmd_emptyfreelist);
398 * This freelist is empty.
399 * This should not happen unless clients
400 * are failing to release the vpmap after
401 * accessing the data. Before resorting
402 * to sleeping, try the next list of the same color.
404 free_ndx = (free_ndx + 1) & vpmd_freemsk;
405 if (free_ndx != end_ndx) {
406 mutex_exit(&releq->vpmq_mtx);
407 mutex_exit(&allocq->vpmq_mtx);
408 vpmflp = &vpmd_free[free_ndx];
409 goto retry_queue;
412 * Tried all freelists.
413 * wait on this list and hope something gets freed.
/* Holding both queue mutexes: drop freeq[1] and sleep on freeq[0],
 * which is the mutex free_vpmap()'s waker protocol uses. */
415 vpmflp->vpm_want++;
416 mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
417 cv_wait(&vpmflp->vpm_free_cv,
418 &vpmflp->vpm_freeq[0].vpmq_mtx);
419 vpmflp->vpm_want--;
420 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
421 vpmflp = &vpmd_free[free_ndx];
422 VPM_DEBUG(vpmd_nofreevpms);
423 goto retry_queue;
424 } else {
426 * Something on the rele queue; flip the alloc
427 * and rele queues and retry.
429 vpmflp->vpm_allocq = releq;
430 vpmflp->vpm_releq = allocq;
431 mutex_exit(&allocq->vpmq_mtx);
432 mutex_exit(&releq->vpmq_mtx);
433 if (page_locked) {
434 ddi_msleep(250);
435 page_locked = 0;
437 goto retry_queue;
/* A candidate vpm was found at the head of the alloc queue. */
439 } else {
440 int gotnewvpm;
441 kmutex_t *pmtx;
442 uint_t vpmref;
445 * Fastpath the case we get the vpmap mutex
446 * on the first try.
448 first = vpm;
449 next_vpmap:
450 vmtx = VPMAPMTX(vpm);
451 if (!mutex_tryenter(vmtx)) {
453 * Another thread is trying to reclaim this slot.
454 * Skip to the next queue or vpmap.
456 if ((vpm = vpm->vpm_next) == first) {
457 goto skip_queue;
458 } else {
459 goto next_vpmap;
464 * Assign this vpm to the newpage.
466 pmtx = PPMTX(newpage);
467 gotnewvpm = 0;
468 mutex_enter(pmtx);
471 * Check if some other thread already assigned a vpm to
472 * this page.
474 if ((vpmref = newpage->p_vpmref) == 0) {
475 newpage->p_vpmref = VPMID(vpm);
476 gotnewvpm = 1;
477 } else {
478 VPM_DEBUG(vpmd_contend);
479 mutex_exit(vmtx);
481 mutex_exit(pmtx);
/* This thread won the race to bind vpm to newpage. */
483 if (gotnewvpm) {
486 * At this point, we've selected the vpm. Remove vpm
487 * from its freelist. If vpm is the first one in
488 * the freelist, update the head of the freelist.
490 if (first == vpm) {
491 ASSERT(first == allocq->vpmq_free);
492 allocq->vpmq_free = vpm->vpm_next;
496 * If the head of the freelist still points to vpm,
497 * then there are no more free vpmaps in that list.
499 if (allocq->vpmq_free == vpm)
501 * Took the last one
503 allocq->vpmq_free = NULL;
504 else {
505 vpm->vpm_prev->vpm_next = vpm->vpm_next;
506 vpm->vpm_next->vpm_prev = vpm->vpm_prev;
508 mutex_exit(&allocq->vpmq_mtx);
509 vpm->vpm_prev = vpm->vpm_next = NULL;
512 * Disassociate the previous page.
513 * p_vpmref is used as a mapping reference to the page.
515 if ((pp = vpm->vpm_pp) != NULL &&
516 vpm->vpm_vp == pp->p_vnode &&
517 vpm->vpm_off == pp->p_offset) {
519 pmtx = PPMTX(pp);
520 if (page_trylock(pp, SE_SHARED)) {
522 * Now verify that it is the correct
523 * page. If not someone else stole it,
524 * so just unlock it and leave.
526 mutex_enter(pmtx);
527 if (PP_ISFREE(pp) ||
528 vpm->vpm_vp != pp->p_vnode ||
529 vpm->vpm_off != pp->p_offset ||
530 pp->p_vpmref != VPMID(vpm)) {
531 mutex_exit(pmtx);
533 page_unlock(pp);
534 } else {
536 * Release the page.
538 pp->p_vpmref = 0;
539 mutex_exit(pmtx);
540 (void) page_release(pp, 1);
542 } else {
544 * If the page cannot be locked, just
545 * clear the p_vpmref and go.
547 mutex_enter(pmtx);
548 if (pp->p_vpmref == VPMID(vpm)) {
549 pp->p_vpmref = 0;
551 mutex_exit(pmtx);
552 VPM_DEBUG(vpmd_prevpagelocked);
557 * Setup vpm to point to the new page.
559 vpm->vpm_pp = newpage;
560 vpm->vpm_vp = newpage->p_vnode;
561 vpm->vpm_off = newpage->p_offset;
/* Lost the race: newpage already has a vpm (vpmref); validate and use it. */
563 } else {
564 int steal = !VPM_MTBF(steals, steals_mtbf);
566 * Page already has a vpm assigned just use that.
567 * Grab the vpm mutex and verify that it is still
568 * the correct one. The pp->p_vpmref should not change
569 * once we have the vpm mutex and the page lock.
571 mutex_exit(&allocq->vpmq_mtx);
572 vpm = VPMP(vpmref);
573 vmtx = VPMAPMTX(vpm);
574 mutex_enter(vmtx);
575 if ((steal && vpm->vpm_refcnt == 0) ||
576 vpm->vpm_pp != newpage) {
578 * The vpm got stolen, retry.
579 * clear the p_vpmref.
581 pmtx = PPMTX(newpage);
582 mutex_enter(pmtx);
583 if (newpage->p_vpmref == vpmref) {
584 newpage->p_vpmref = 0;
586 mutex_exit(pmtx);
588 mutex_exit(vmtx);
589 VPM_DEBUG(vpmd_steals);
590 goto retry_queue;
591 } else if (vpm->vpm_refcnt == 0) {
593 * Remove it from the free list if it
594 * exists there.
596 VPMAP_RMFREELIST(vpm);
/* Return with the vpmap mutex still held. */
599 return (vpm);
603 static void
604 free_vpmap(struct vpmap *vpm)
606 struct vpmfree *vpmflp;
607 struct vpmap *vpmfreelist;
608 union vpm_freeq *releq;
610 ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
612 if (vpm->vpm_refcnt != 0) {
613 panic("free_vpmap");
614 /*NOTREACHED*/
617 vpmflp = &vpmd_free[vpm->vpm_free_ndx];
619 * Add to the tail of the release queue
620 * Note that vpm_releq and vpm_allocq could toggle
621 * before we get the lock. This does not affect
622 * correctness as the 2 queues are only maintained
623 * to reduce lock pressure.
625 releq = vpmflp->vpm_releq;
626 if (releq == &vpmflp->vpm_freeq[0]) {
627 vpm->vpm_ndxflg = 0;
628 } else {
629 vpm->vpm_ndxflg = 1;
631 mutex_enter(&releq->vpmq_mtx);
632 vpmfreelist = releq->vpmq_free;
633 if (vpmfreelist == 0) {
634 int want;
636 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
638 * Both queue mutexes are held to set vpm_want;
639 * snapshot the value before dropping releq mutex.
640 * If vpm_want appears after the releq mutex is dropped,
641 * then the vpmap just freed is already gone.
643 want = vpmflp->vpm_want;
644 mutex_exit(&releq->vpmq_mtx);
646 * See if there was a waiter before dropping the releq mutex
647 * then recheck after obtaining vpm_freeq[0] mutex as
648 * the another thread may have already signaled.
650 if (want) {
651 mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
652 if (vpmflp->vpm_want)
653 cv_signal(&vpmflp->vpm_free_cv);
654 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
656 } else {
657 vpm->vpm_next = vpmfreelist;
658 vpm->vpm_prev = vpmfreelist->vpm_prev;
659 vpmfreelist->vpm_prev = vpm;
660 vpm->vpm_prev->vpm_next = vpm;
661 mutex_exit(&releq->vpmq_mtx);
666 * Get the vpmap for the page.
667 * The refcnt of this vpm is incremented.
669 static struct vpmap *
670 get_vpmap(page_t *pp)
672 struct vpmap *vpm = NULL;
673 kmutex_t *vmtx;
674 kmutex_t *pmtx;
675 unsigned int refid;
677 ASSERT((pp != NULL) && PAGE_LOCKED(pp));
679 if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
680 vpm = VPMP(refid);
681 vmtx = VPMAPMTX(vpm);
682 mutex_enter(vmtx);
684 * Since we have the page lock and the vpm mutex, the
685 * pp->p_vpmref cannot change.
687 if (vpm->vpm_pp != pp) {
688 pmtx = PPMTX(pp);
691 * Clear the p_vpmref as it is incorrect.
692 * This can happen if the page was stolen.
693 * On x64 this should not happen as p_vpmref
694 * is treated as a mapping on the page. So
695 * if the page is stolen, the mapping would have
696 * been cleared in page_unload().
698 mutex_enter(pmtx);
699 if (pp->p_vpmref == refid)
700 pp->p_vpmref = 0;
701 mutex_exit(pmtx);
703 mutex_exit(vmtx);
704 vpm = NULL;
705 } else if (vpm->vpm_refcnt == 0) {
707 * Got the vpm, remove it from the free
708 * list if it exists there.
710 VPMAP_RMFREELIST(vpm);
713 if (vpm == NULL) {
715 * get_free_vpmap() returns with the vpmap mutex held.
717 vpm = get_free_vpmap(pp);
718 vmtx = VPMAPMTX(vpm);
719 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
720 } else {
721 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
724 vpm->vpm_refcnt++;
725 mutex_exit(vmtx);
727 return (vpm);
730 /* END --- vpm cache ---- */
733 * The vnode page mapping(vpm) interface routines.
737 * Find or create the pages starting form baseoff for specified
738 * length 'len'.
740 static int
741 vpm_pagecreate(
742 struct vnode *vp,
743 uoff_t baseoff,
744 size_t len,
745 vmap_t vml[],
746 int nseg,
747 int *newpage)
750 page_t *pp = NULL;
751 caddr_t base;
752 uoff_t off = baseoff;
753 int i;
754 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
756 for (i = 0; len > 0; len -= PAGESIZE, i++) {
757 struct vpmap *vpm;
760 if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {
762 base = segkpm_create_va(off);
765 * the seg pointer passed in is just advisor. Just
766 * pass segkmap for now like segmap does with
767 * segmap_kpm enabled.
769 if ((pp = page_create_va(&vp->v_object, off, PAGESIZE,
770 PG_WAIT, segkmap,
771 base)) == NULL) {
772 panic("segmap_pagecreate_vpm: "
773 "page_create failed");
774 /*NOTREACHED*/
776 if (newpage != NULL)
777 *newpage = 1;
779 page_io_unlock(pp);
783 * Get the vpm for this page_t.
785 if (vpm_cache_enable) {
786 vpm = get_vpmap(pp);
787 vml[i].vs_data = (void *)&vpm->vpm_pp;
788 } else {
789 vml[i].vs_data = (void *)pp;
790 pp->p_vpmref = 0;
793 vml[i].vs_addr = hat_kpm_mapin(pp, 0);
794 vml[i].vs_len = PAGESIZE;
796 off += PAGESIZE;
798 vml[i].vs_data = NULL;
799 vml[i].vs_addr = NULL;
800 return (0);
805 * Returns vpm mappings of pages in the range [off, off+len], where
806 * len is rounded up to the PAGESIZE boundary. The list of pages and
807 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
808 * The nseg is the number of vmap_t entries in the array.
810 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
811 * For such cases, use the seg_map interfaces.
/*
 * Map the pages covering [off, off + len) of vp into the vml[] scatter/
 * gather list (at most nseg - 1 pages; one slot terminates the list with
 * vs_addr == NULL). If !fetchpage the pages are created via
 * vpm_pagecreate(); otherwise pages missing from the cache (or not yet
 * modified, for S_WRITE) are fetched with fop_getpage().
 * Returns 0 or the fop_getpage() error.
 */
814 vpm_map_pages(
815 struct vnode *vp,
816 uoff_t off,
817 size_t len,
818 int fetchpage,
819 vmap_t *vml,
820 int nseg,
821 int *newpage,
822 enum seg_rw rw)
824 extern struct vnode *common_specvp();
825 uoff_t baseoff;
826 uint_t prot;
827 caddr_t base;
828 page_t *pp, *pplist[MAXVMAPS];
829 struct vpmap *vpm;
830 int i, error = 0;
831 size_t tlen;
833 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
834 baseoff = off & (offset_t)PAGEMASK;
835 vml[0].vs_data = NULL;
836 vml[0].vs_addr = NULL;
/* tlen is the page-rounded span starting at the page boundary. */
838 tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
840 * Restrict it to VPMMAXLEN.
842 if (tlen > (VPMMAXPGS * PAGESIZE)) {
843 tlen = VPMMAXPGS * PAGESIZE;
846 * Ensure length fits within the vml[] array. One element of
847 * the array is used to mark the end of the scatter/gather list
848 * of valid mappings by setting its vs_addr = NULL. Leave space
849 * for this element.
851 if (tlen > ((nseg - 1) * PAGESIZE)) {
852 tlen = ((nseg - 1) * PAGESIZE);
854 len = tlen;
857 * If this is a block device we have to be sure to use the
858 * "common" block device vnode for the mapping.
860 if (vp->v_type == VBLK)
861 vp = common_specvp(vp);
864 if (!fetchpage)
865 return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
/*
 * Note the increment clause order: after each pass i++ runs, then the
 * NEXT pplist slot is NULLed, keeping the list NULL-terminated as it
 * grows. (On the error path pplist[0] is NULLed explicitly below.)
 */
867 for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
869 pp = page_lookup(&vp->v_object, baseoff, SE_SHARED);
872 * If we did not find the page or if this page was not
873 * in vpm cache(p_vpmref == 0), then let fop_getpage get
874 * all the pages.
875 * We need to call fop_getpage so that filesystems can do some
876 * (un)necessary tracking for sequential access.
879 if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
880 (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
881 != (P_MOD | P_REF))) {
882 int j;
883 if (pp != NULL) {
884 page_unlock(pp);
887 * If we did not find the desired set of pages,
888 * from the page cache, just call fop_getpage to get
889 * all the pages.
/* Drop the shared locks taken on the pages gathered so far. */
891 for (j = 0; j < i; j++) {
892 page_unlock(pplist[j]);
896 baseoff = off & (offset_t)PAGEMASK;
898 * Pass a dummy address as it will be required
899 * by page_create_va(). We pass segkmap as the seg
900 * as some file systems(UFS) check it.
902 base = segkpm_create_va(baseoff);
904 error = fop_getpage(vp, baseoff, tlen, &prot, pplist,
905 tlen, segkmap, base, rw, CRED(), NULL);
906 if (error) {
907 VPM_DEBUG(vpmd_getpagefailed);
908 pplist[0] = NULL;
910 break;
911 } else {
912 pplist[i] = pp;
913 baseoff += PAGESIZE;
/* Error unwind: unlock everything gathered, clear the SGL, bail. */
917 if (error) {
918 for (i = 0; pplist[i] != NULL; i++) {
919 page_unlock(pplist[i]);
920 pplist[i] = NULL;
922 vml[0].vs_addr = NULL;
923 vml[0].vs_data = NULL;
924 return (error);
928 * Get the vpm's for pages.
930 for (i = 0; pplist[i] != NULL; i++) {
931 if (vpm_cache_enable) {
932 vpm = get_vpmap(pplist[i]);
933 vml[i].vs_data = (void *)&(vpm->vpm_pp);
934 } else {
935 vml[i].vs_data = (void *)pplist[i];
936 pplist[i]->p_vpmref = 0;
939 vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
940 vml[i].vs_len = PAGESIZE;
/* Terminate the scatter/gather list. */
943 vml[i].vs_data = NULL;
944 vml[i].vs_addr = NULL;
946 return (0);
950 * Release the vpm mappings on the pages and unlock them.
952 void
953 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
955 int i;
956 struct vpmap *vpm;
957 kmutex_t *mtx;
958 page_t *pp;
960 for (i = 0; vml[i].vs_data != NULL; i++) {
961 ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
963 if (vpm_cache_enable) {
964 pp = *(((page_t **)vml[i].vs_data));
965 } else {
966 pp = (page_t *)vml[i].vs_data;
970 * Mark page as being modified or referenced, bacause vpm pages
971 * would not cause faults where it would be set normally.
973 if (rw == S_WRITE) {
974 hat_setrefmod(pp);
975 } else {
976 ASSERT(rw == S_READ);
977 hat_setref(pp);
980 if (vpm_cache_enable) {
981 vpm = (struct vpmap *)((char *)vml[i].vs_data
982 - offsetof(struct vpmap, vpm_pp));
983 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
984 page_unlock(pp);
985 mtx = VPMAPMTX(vpm);
986 mutex_enter(mtx);
988 if (--vpm->vpm_refcnt == 0) {
989 free_vpmap(vpm);
991 mutex_exit(mtx);
992 } else {
993 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
994 (void) page_release(pp, 1);
996 vml[i].vs_data = NULL;
997 vml[i].vs_addr = NULL;
1002 * Given the vp, off and the uio structure, this routine will do the
1003 * the copy (uiomove). If the last page created is partially written,
1004 * the rest of the page is zeroed out. It also zeros the beginning of
1005 * the first page till the start offset if requested(zerostart).
1006 * If pages are to be fetched, it will call the filesystem's getpage
1007 * function (fop_getpage) to get them, otherwise they will be created if
1008 * not already present in the page cache.
/*
 * Copy between the uio and vp's pages over [off, off + len) using vpm
 * mappings. When pages are newly created (!fetchpage), the area of the
 * first page before 'off' is zeroed if 'zerostart', and the tail of the
 * last page written is zeroed after the copy. Returns 0 or the error
 * from vpm_map_pages()/uiomove().
 */
1011 vpm_data_copy(struct vnode *vp,
1012 uoff_t off,
1013 size_t len,
1014 struct uio *uio,
1015 int fetchpage,
1016 int *newpage,
1017 int zerostart,
1018 enum seg_rw rw)
1020 int error;
1021 struct vmap vml[MINVMAPS];
1022 enum uio_rw uiorw;
1023 int npages = 0;
1025 uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
1027 * 'off' will be the offset where the I/O starts.
1028 * We get the pages starting at the (off & PAGEMASK)
1029 * page boundary.
1031 error = vpm_map_pages(vp, off, (uint_t)len,
1032 fetchpage, vml, MINVMAPS, &npages, rw);
1034 if (newpage != NULL)
1035 *newpage = npages;
1036 if (!error) {
/* pon is the in-page start offset; it applies to the first page only. */
1037 int i, pn, slen = len;
1038 int pon = off & PAGEOFFSET;
1041 * Clear from the beginning of the page to start offset
1042 * if requested.
1044 if (!fetchpage && zerostart) {
1045 (void) kzero(vml[0].vs_addr, (uint_t)pon);
1046 VPM_DEBUG(vpmd_zerostart);
/* Copy page by page; after the first pass pon resets to 0. */
1049 for (i = 0; !error && slen > 0 &&
1050 vml[i].vs_addr != NULL; i++) {
1051 pn = (int)MIN(slen, (PAGESIZE - pon));
1052 error = uiomove(vml[i].vs_addr + pon,
1053 (long)pn, uiorw, uio);
1054 slen -= pn;
1055 pon = 0;
1059 * When new pages are created, zero out part of the
1060 * page we did not copy to.
1062 if (!fetchpage && npages &&
1063 uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1064 int nzero;
/* uio_loffset now sits where the copy stopped; zero to page end. */
1066 pon = (uio->uio_loffset & PAGEOFFSET);
1067 nzero = PAGESIZE - pon;
1068 i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1069 (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1071 vpm_unmap_pages(vml, rw);
1073 return (error);
1077 * called to flush pages for the given vnode covering
1078 * [off, off+len] range.
1081 vpm_sync_pages(struct vnode *vp,
1082 uoff_t off,
1083 size_t len,
1084 uint_t flags)
1086 extern struct vnode *common_specvp();
1087 int bflags = 0;
1088 int error = 0;
1089 size_t psize = roundup(len, PAGESIZE);
1092 * If this is a block device we have to be sure to use the
1093 * "common" block device vnode for the mapping.
1095 if (vp->v_type == VBLK)
1096 vp = common_specvp(vp);
1098 if ((flags & ~SM_DONTNEED) != 0) {
1099 if (flags & SM_ASYNC)
1100 bflags |= B_ASYNC;
1101 if (flags & SM_INVAL)
1102 bflags |= B_INVAL;
1103 if (flags & SM_DESTROY)
1104 bflags |= (B_INVAL|B_TRUNC);
1105 if (flags & SM_FREE)
1106 bflags |= B_FREE;
1107 if (flags & SM_DONTNEED)
1108 bflags |= B_DONTNEED;
1110 error = fop_putpage(vp, off, psize, bflags, CRED(), NULL);
1113 return (error);
1117 #else /* SEGKPM_SUPPORT */
1119 /* vpm stubs */
/* Stub for builds without SEGKPM_SUPPORT: VPM init is a no-op. */
1120 void
1121 vpm_init()
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1125 /*ARGSUSED*/
1127 vpm_pagecreate(
1128 struct vnode *vp,
1129 uoff_t baseoff,
1130 size_t len,
1131 vmap_t vml[],
1132 int nseg,
1133 int *newpage)
1135 return (0);
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1138 /*ARGSUSED*/
1140 vpm_map_pages(
1141 struct vnode *vp,
1142 uoff_t off,
1143 size_t len,
1144 int fetchpage,
1145 vmap_t vml[],
1146 int nseg,
1147 int *newpage,
1148 enum seg_rw rw)
1150 return (0);
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1153 /*ARGSUSED*/
1155 vpm_data_copy(struct vnode *vp,
1156 uoff_t off,
1157 size_t len,
1158 struct uio *uio,
1159 int fetchpage,
1160 int *newpage,
1161 int zerostart,
1162 enum seg_rw rw)
1164 return (0);
/* Stub for builds without SEGKPM_SUPPORT: nothing to unmap. */
1167 /*ARGSUSED*/
1168 void
1169 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1172 /*ARGSUSED*/
1174 vpm_sync_pages(struct vnode *vp,
1175 uoff_t off,
1176 size_t len,
1177 uint_t flags)
1179 return (0);
1181 #endif /* SEGKPM_SUPPORT */