/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * This file supplies vm support for the vnode operations that deal with pages.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>

#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

static struct kmem_cache *marker_cache = NULL;
/*
 * Find the largest contiguous block which contains `addr' for file offset
 * `offset' in it while living within the file system block sizes (`vp_off'
 * and `vp_len') and the address space limits for which no pages currently
 * exist and which map to consecutive file offsets.
 */
page_t *
pvn_read_kluster(
    struct vnode *vp,
    u_offset_t off,
    struct seg *seg,
    caddr_t addr,
    u_offset_t *offp,		/* return values */
    size_t *lenp,		/* return values */
    u_offset_t vp_off,
    size_t vp_len,
    int isra)
{
    ssize_t deltaf, deltab;
    u_offset_t vp_end;
    page_t *plist = NULL;
    page_t *pp;
    spgcnt_t pagesavail;
    ASSERT(off >= vp_off && off < vp_off + vp_len);

    /*
     * We only want to do klustering/read ahead if there
     * are more than minfree pages currently available.
     */
    pagesavail = freemem - minfree;

    if (pagesavail <= 0)
        if (isra)
            return ((page_t *)NULL);	/* ra case - give up */
        else
            pagesavail = 1;		/* must return a page */
    /* We calculate in pages instead of bytes due to 32-bit overflows */
    if (pagesavail < (spgcnt_t)btopr(vp_len)) {
        /*
         * Don't have enough free memory for the
         * max request, try sizing down vp request.
         */
        deltab = (ssize_t)(off - vp_off);
        vp_len -= deltab;
        vp_off += deltab;
        if (pagesavail < btopr(vp_len)) {
            /*
             * Still not enough memory, just settle for
             * pagesavail which is at least 1.
             */
            vp_len = ptob(pagesavail);
        }
    }
    vp_end = vp_off + vp_len;
    ASSERT(off >= vp_off && off < vp_end);

    if (isra && SEGOP_KLUSTER(seg, addr, 0))
        return ((page_t *)NULL);	/* segment driver says no */

    if ((plist = page_create_va(vp, off,
        PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
        return ((page_t *)NULL);
    if (vp_len <= PAGESIZE || pvn_nofodklust) {
        *offp = off;
        *lenp = MIN(vp_len, PAGESIZE);
    } else {
        /*
         * Scan back from front by incrementing "deltab" and
         * comparing "off" with "vp_off + deltab" to avoid
         * "signed" versus "unsigned" conversion problems.
         */
        for (deltab = PAGESIZE; off >= vp_off + deltab;
            deltab += PAGESIZE) {
            /*
             * Call back to the segment driver to verify that
             * the klustering/read ahead operation makes sense.
             */
            if (SEGOP_KLUSTER(seg, addr, -deltab))
                break;		/* page not eligible */
            if ((pp = page_create_va(vp, off - deltab,
                PAGESIZE, PG_EXCL, seg, addr - deltab)) == NULL)
                break;		/* already have the page */
            /*
             * Add page to front of page list.
             */
            page_add(&plist, pp);
        }
        deltab -= PAGESIZE;

        /* scan forward from front */
        for (deltaf = PAGESIZE; off + deltaf < vp_end;
            deltaf += PAGESIZE) {
            /*
             * Call back to the segment driver to verify that
             * the klustering/read ahead operation makes sense.
             */
            if (SEGOP_KLUSTER(seg, addr, deltaf))
                break;		/* page not file extension */
            if ((pp = page_create_va(vp, off + deltaf,
                PAGESIZE, PG_EXCL, seg, addr + deltaf)) == NULL)
                break;		/* already have page */
            /*
             * Add page to end of page list.
             */
            page_add(&plist, pp);
            plist = plist->p_next;
        }
        *offp = off = off - deltab;
        *lenp = deltab + deltaf;
        ASSERT(off >= vp_off);
    }

    /*
     * If we ended up getting more than was actually
     * requested, retract the returned length to only
     * reflect what was requested.  This might happen
     * if we were allowed to kluster pages across a
     * span of (say) 5 frags, and frag size is less
     * than PAGESIZE.  We need a whole number of
     * pages to contain those frags, but the returned
     * size should only allow the returned range to
     * extend as far as the end of the frags.
     */
    if ((vp_off + vp_len) < (off + *lenp)) {
        ASSERT(vp_end > off);
        *lenp = vp_end - off;
    }

    TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
        "pvn_read_kluster:seg %p addr %x isra %x",
        seg, addr, isra);

    return (plist);
}
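/*
 * Usage sketch (not part of the original file): a file system getpage
 * routine typically calls pvn_read_kluster() when it takes a fault at
 * `off', then reads the returned range in one i/o.  The names io_off,
 * io_len, blkoff and bsize below are hypothetical caller locals;
 * bsize would be the file system block size containing the fault.
 *
 *	u_offset_t io_off;
 *	size_t io_len;
 *	page_t *pp;
 *
 *	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
 *	    blkoff, bsize, 0);
 *	if (pp != NULL) {
 *		read io_len bytes at io_off into the returned list,
 *		then report completion through pvn_read_done()
 *	}
 */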
/*
 * Handle pages for this vnode on either side of the page "pp"
 * which has been locked by the caller.  This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len] up
 * until a page which is not found.  The offset and length
 * of pages included is returned in "*offp" and "*lenp".
 *
 * Returns a list of dirty locked pages all ready to be
 * written back.
 */
page_t *
pvn_write_kluster(
    struct vnode *vp,
    page_t *pp,
    u_offset_t *offp,		/* return values */
    size_t *lenp,		/* return values */
    u_offset_t vp_off,
    size_t vp_len,
    int flags)
{
    u_offset_t off;
    page_t *dirty;
    size_t deltab, deltaf;
    se_t se;
    u_offset_t vp_end;

    off = pp->p_offset;
    /*
     * Klustering should not be done if we are invalidating
     * pages since we could destroy pages that belong to
     * some other process if this is a swap vnode.
     */
    if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
        *offp = off;
        *lenp = PAGESIZE;
        return (pp);
    }

    if (flags & (B_FREE | B_INVAL))
        se = SE_EXCL;
    else
        se = SE_SHARED;

    dirty = pp;
    /*
     * Scan backwards looking for pages to kluster by incrementing
     * "deltab" and comparing "off" with "vp_off + deltab" to
     * avoid "signed" versus "unsigned" conversion problems.
     */
    for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
        pp = page_lookup_nowait(vp, off - deltab, se);
        if (pp == NULL)
            break;		/* page not found */
        if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
            break;
        page_add(&dirty, pp);
    }
    deltab -= PAGESIZE;
    vp_end = vp_off + vp_len;
    /* now scan forwards looking for pages to kluster */
    for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
        pp = page_lookup_nowait(vp, off + deltaf, se);
        if (pp == NULL)
            break;		/* page not found */
        if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
            break;
        page_add(&dirty, pp);
        dirty = dirty->p_next;
    }
    *offp = off - deltab;
    *lenp = deltab + deltaf;

    return (dirty);
}
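/*
 * Usage sketch (not part of the original file): a putapage routine
 * commonly expands one dirty page into a kluster before starting i/o.
 * Here klstart and klsize are hypothetical locals describing the
 * largest range the file system is willing to write in one operation.
 *
 *	pp = pvn_write_kluster(vp, pp, &io_off, &io_len,
 *	    klstart, klsize, flags);
 *
 * pp then heads a list of dirty, locked pages covering
 * [io_off, io_off + io_len); write them out, then call
 * pvn_write_done() once the i/o has completed.
 */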
/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
    page_t *pp;

    while (plist != NULL) {
        pp = plist;
        page_sub(&plist, pp);
        page_io_unlock(pp);
        page_unlock(pp);
    }
}
/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
    page_t *pp;

    while (plist != NULL) {
        pp = plist;
        page_sub(&plist, pp);
        page_io_unlock(pp);
        if (flags & B_ERROR) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else {
            (void) page_release(pp, 0);
        }
    }
}
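/*
 * Usage sketch (not part of the original file): a getpage routine that
 * started an async read, or that got an i/o error back from biowait(),
 * would dispose of its page list like this; `err' is a hypothetical
 * local holding the i/o status and `pl' the caller's page list array:
 *
 *	if (err != 0)
 *		pvn_read_done(pp, B_ERROR);	destroy the whole list
 *	else if (pl == NULL)
 *		pvn_read_done(pp, 0);		async: just unlock them
 */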
/*
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 */
int write_free = 1;

pgcnt_t pages_before_pager = 200;	/* LMXXX */
/*
 * Routine to be called when page-outs complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be re-tried later.
 */
void
pvn_write_done(page_t *plist, int flags)
{
    int dfree = 0;
    int pgrec = 0;
    int pgout = 0;
    int pgpgout = 0;
    int anonpgout = 0;
    int anonfree = 0;
    int fspgout = 0;
    int fsfree = 0;
    int execpgout = 0;
    int execfree = 0;
    page_t *pp;
    struct cpu *cpup;
    struct vnode *vp = NULL;	/* for probe */
    uint_t ppattr;
    kmutex_t *vphm = NULL;

    ASSERT((flags & B_READ) == 0);
    /*
     * If we are about to start paging anyway, start freeing pages.
     */
    if (write_free && freemem < lotsfree + pages_before_pager &&
        (flags & B_ERROR) == 0) {
        flags |= B_FREE;
    }

    /*
     * Handle each page involved in the i/o operation.
     */
    while (plist != NULL) {
        pp = plist;
        ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
        page_sub(&plist, pp);

        /* Kernel probe support */
        if (vp == NULL)
            vp = pp->p_vnode;
        if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
            /*
             * Move page to the top of the v_page list.
             * Skip pages modified during IO.
             */
            vphm = page_vnode_mutex(vp);
            mutex_enter(vphm);
            if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
                page_vpsub(&vp->v_pages, pp);
                page_vpadd(&vp->v_pages, pp);
            }
            mutex_exit(vphm);
        }
        if (flags & B_ERROR) {
            /*
             * Write operation failed.  We don't want
             * to destroy (or free) the page unless B_FORCE
             * is set. We set the mod bit again and release
             * all locks on the page so that it will get written
             * back again later when things are hopefully
             * better again.
             * If B_INVAL and B_FORCE is set we really have
             * to destroy the page.
             */
            if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
                /*LINTED: constant in conditional context*/
                VN_DISPOSE(pp, B_INVAL, 0, kcred);
            } else {
                hat_setmod_only(pp);
                page_io_unlock(pp);
                page_unlock(pp);
            }
        } else if (flags & B_INVAL) {
            /*
             * XXX - Failed writes with B_INVAL set are
             * not handled appropriately.
             */
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE && !hat_page_is_mapped(pp)) {
            /*
             * Update statistics for pages being paged out
             */
            if (pp->p_vnode) {
                if (IS_SWAPFSVP(pp->p_vnode)) {
                    anonpgout++;
                } else {
                    if (pp->p_vnode->v_flag & VVMEXEC) {
                        execpgout++;
                    } else {
                        fspgout++;
                    }
                }
            }
            page_io_unlock(pp);
            pgout = 1;
            pgpgout++;
            TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
                "page_ws_out:pp %p", pp);
            /*
             * The page_struct_lock need not be acquired to
             * examine "p_lckcnt" and "p_cowcnt" since we'll
             * have an "exclusive" lock if the upgrade succeeds.
             */
            if (page_tryupgrade(pp) &&
                pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
                /*
                 * Check if someone has reclaimed the
                 * page.  If ref and mod are not set, no
                 * one is using it so we can free it.
                 * The rest of the system is careful
                 * to use the NOSYNC flag to unload
                 * translations set up for i/o w/o
                 * affecting ref and mod bits.
                 *
                 * Obtain a copy of the real hardware
                 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
                 * to avoid having to flush the cache.
                 */
                ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
                    HAT_SYNC_STOPON_MOD);
            ck_refmod:
                if (!(ppattr & (P_REF | P_MOD))) {
                    if (hat_page_is_mapped(pp)) {
                        /*
                         * Doesn't look like the page
                         * was modified so now we
                         * really have to unload the
                         * translations.  Meanwhile
                         * another CPU could've
                         * modified it so we have to
                         * check again.  We don't loop
                         * forever here because now
                         * the translations are gone
                         * and no one can get a new one
                         * since we have the "exclusive"
                         * lock on the page.
                         */
                        (void) hat_pageunload(pp,
                            HAT_FORCE_PGUNLOAD);
                        ppattr = hat_page_getattr(pp,
                            P_REF | P_MOD);
                        goto ck_refmod;
                    }
                    /*
                     * Update statistics for pages being
                     * freed.
                     */
                    if (pp->p_vnode) {
                        if (IS_SWAPFSVP(pp->p_vnode)) {
                            anonfree++;
                        } else {
                            if (pp->p_vnode->v_flag
                                & VVMEXEC) {
                                execfree++;
                            } else {
                                fsfree++;
                            }
                        }
                    }
                    /*LINTED: constant in conditional ctx*/
                    VN_DISPOSE(pp, B_FREE,
                        (flags & B_DONTNEED), kcred);
                    dfree++;
                } else {
                    page_unlock(pp);
                    pgrec++;
                    TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
                        "page_ws_free:pp %p", pp);
                }
            } else {
                /*
                 * Page is either `locked' in memory
                 * or was reclaimed and now has a
                 * "shared" lock, so release it.
                 */
                page_unlock(pp);
            }
        } else {
            /*
             * Neither B_FREE nor B_INVAL nor B_ERROR.
             * Just release locks.
             */
            page_io_unlock(pp);
            page_unlock(pp);
        }
    }
    CPU_STATS_ENTER_K();
    cpup = CPU;		/* get cpup now that CPU cannot change */
    CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    CPU_STATS_EXIT_K();
    TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
        tnf_opaque, vnode, vp,
        tnf_ulong, pages_pageout, pgpgout,
        tnf_ulong, pages_freed, dfree,
        tnf_ulong, pages_reclaimed, pgrec);
}
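/*
 * Usage sketch (not part of the original file): per the comment above,
 * a synchronous VOP_PUTPAGE caller waits for the i/o and then frees the
 * page list; `bp' is a hypothetical buf set up for the pageout i/o:
 *
 *	if ((flags & B_ASYNC) == 0) {
 *		err = biowait(bp);
 *		pageio_done(bp);
 *		pvn_write_done(pp, (err != 0 ? B_ERROR : 0) |
 *		    B_WRITE | flags);
 *	}
 */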
/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done.  B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1 if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
    ASSERT((flags & (B_INVAL | B_FREE)) ?
        PAGE_EXCL(pp) : PAGE_SHARED(pp));
    ASSERT(PP_ISFREE(pp) == 0);
    /*
     * If trying to invalidate or free a logically `locked' page,
     * forget it.  Don't need page_struct_lock to check p_lckcnt and
     * p_cowcnt as the page is exclusively locked.
     */
    if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
        (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
        page_unlock(pp);
        return (0);
    }
    /*
     * Now acquire the i/o lock so we can add it to the dirty
     * list (if necessary).  We avoid blocking on the i/o lock
     * in the following cases:
     *
     *	If B_DELWRI is set, which implies that this request is
     *	due to a klustering operation.
     *
     *	If this is an async (B_ASYNC) operation and we are not doing
     *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
     *	that the page is written out].
     */
    if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
        if (!page_io_trylock(pp)) {
            page_unlock(pp);
            return (0);
        }
    } else {
        page_io_lock(pp);
    }
    /*
     * If we want to free or invalidate the page then
     * we need to unload it so that anyone who wants
     * it will have to take a minor fault to get it.
     * Otherwise, we're just writing the page back so we
     * need to sync up the hardware and software mod bit to
     * detect any future modifications.  We clear the
     * software mod bit when we put the page on the dirty
     * list.
     */
    if (flags & (B_INVAL | B_FREE)) {
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    } else {
        (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
    }
    if (!hat_ismod(pp) || (flags & B_TRUNC)) {
        /*
         * Don't need to add it to the
         * dirty list.
         */
        page_io_unlock(pp);
        if (flags & B_INVAL) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_INVAL, 0, kcred);
        } else if (flags & B_FREE) {
            /*LINTED: constant in conditional context*/
            VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
        } else {
            /*
             * This is advisory path for the callers
             * of VOP_PUTPAGE() who prefer freeing the
             * page _only_ if no one else is accessing it.
             * E.g. segmap_release()
             *
             * The above hat_ismod() check is useless because:
             * (1) we may not be holding SE_EXCL lock;
             * (2) we've not unloaded _all_ translations
             *
             * Let page_release() do the heavy-lifting.
             */
            (void) page_release(pp, 1);
        }
        return (0);
    }
    /*
     * Page is dirty, get it ready for the write back
     * and add page to the dirty list.
     */
    hat_clrrefmod(pp);

    /*
     * If we're going to free the page when we're done
     * then we can let others try to use it starting now.
     * We'll detect the fact that they used it when the
     * i/o is done and avoid freeing the page.
     */
    if (flags & B_FREE)
        page_downgrade(pp);

    TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

    return (1);
}
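/*
 * Usage sketch (not part of the original file): a simple putpage
 * implementation drives pvn_getdirty() with a looked-up page and only
 * starts i/o when it returns 1 (in which case the "iolock" is held):
 *
 *	pp = page_lookup(vp, off, SE_SHARED);
 *	if (pp != NULL && pvn_getdirty(pp, flags)) {
 *		page is dirty and iolocked: write it out, then
 *		report completion through pvn_write_done()
 *	}
 */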
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
    page_t *mark = buf;
    bzero(mark, sizeof (page_t));
    mark->p_hash = PVN_VPLIST_HASH_TAG;
    return (0);
}
void
pvn_init()
{
    if (pvn_vmodsort_disable == 0)
        pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
    marker_cache = kmem_cache_create("marker_cache",
        sizeof (page_t), 0, marker_constructor,
        NULL, NULL, NULL, NULL, 0);
}
/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
    vnode_t *vp,
    u_offset_t off,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *,
        size_t *, int, cred_t *),
    int flags,
    cred_t *cred)
{
    page_t *pp;
    page_t *mark;		/* marker page that moves toward head */
    page_t *end;		/* marker page at end of list */
    int err = 0;
    int error;
    kmutex_t *vphm;
    se_t se;
    page_t **where_to_move;
    ASSERT(vp->v_type != VCHR);

    if (vp->v_pages == NULL)
        return (0);

    /*
     * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
     *
     * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
     * from getting blocked while flushing pages to a dead NFS server.
     */
    mutex_enter(&vp->v_lock);
    if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
        mutex_exit(&vp->v_lock);
        return (EAGAIN);
    }

    while (vp->v_flag & VVMLOCK)
        cv_wait(&vp->v_cv, &vp->v_lock);

    if (vp->v_pages == NULL) {
        mutex_exit(&vp->v_lock);
        return (0);
    }

    vp->v_flag |= VVMLOCK;
    mutex_exit(&vp->v_lock);
    /*
     * Set up the marker pages used to walk the list
     */
    end = kmem_cache_alloc(marker_cache, KM_SLEEP);
    end->p_vnode = vp;
    end->p_offset = (u_offset_t)-2;
    mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
    mark->p_vnode = vp;
    mark->p_offset = (u_offset_t)-1;
    /*
     * Grab the lock protecting the vnode's page list
     * note that this lock is dropped at times in the loop.
     */
    vphm = page_vnode_mutex(vp);
    mutex_enter(vphm);
    if (vp->v_pages == NULL)
        goto leave;

    /*
     * insert the markers and loop through the list of pages
     */
    page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
    page_vpadd(&mark->p_vpnext, end);
    for (;;) {
        /*
         * If only doing an async write back, then we can
         * stop as soon as we get to start of the list.
         */
        if (flags == B_ASYNC && vp->v_pages == mark)
            break;

        /*
         * otherwise stop when we've gone through all the pages
         */
        if (mark->p_vpprev == end)
            break;

        pp = mark->p_vpprev;
        if (vp->v_pages == pp)
            where_to_move = &vp->v_pages;
        else
            where_to_move = &pp->p_vpprev->p_vpnext;

        ASSERT(pp->p_vnode == vp);
        /*
         * If just flushing dirty pages to disk and this vnode
         * is using a sorted list of pages, we can stop processing
         * as soon as we find an unmodified page, since all the
         * modified pages are visited first.
         */
        if (IS_VMODSORT(vp) &&
            !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
            if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef DEBUG
                /*
                 * For debug kernels examine what should be
                 * all the remaining clean pages, asserting
                 * that they are not modified.
                 */
                page_t *chk = pp;
                int attr;

                page_vpsub(&vp->v_pages, mark);
                page_vpadd(where_to_move, mark);
                do {
                    chk = chk->p_vpprev;
                    ASSERT(chk != end);
                    if (chk == mark)
                        continue;
                    attr = hat_page_getattr(chk, P_MOD |
                        P_REF);
                    if ((attr & P_MOD) == 0)
                        continue;
                    panic("v_pages list not all clean: "
                        "page_t*=%p vnode=%p off=%lx "
                        "attr=0x%x last clean page_t*=%p\n",
                        (void *)chk, (void *)chk->p_vnode,
                        (long)chk->p_offset, attr,
                        (void *)pp);
                } while (chk != vp->v_pages);
#endif
                break;
            } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
                /*
                 * Couldn't get io lock, wait until IO is done.
                 * Block only for sync IO since we don't want
                 * to blow performance for async IO.
                 */
                mutex_exit(vphm);
                page_io_wait(pp);
                mutex_enter(vphm);
                continue;
            }
        }
        /*
         * Skip this page if the offset is out of the desired range.
         * Just move the marker and continue.
         */
        if (pp->p_offset < off) {
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
            continue;
        }
        /*
         * If we are supposed to invalidate or free this
         * page, then we need an exclusive lock.
         */
        se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

        /*
         * We must acquire the page lock for all synchronous
         * operations (invalidate, free and write).
         */
        if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
            /*
             * If the page_lock() drops the mutex
             * we must retry the loop.
             */
            if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
                continue;

            /*
             * It's ok to move the marker page now.
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);
        } else {
            /*
             * update the marker page for all remaining cases
             */
            page_vpsub(&vp->v_pages, mark);
            page_vpadd(where_to_move, mark);

            /*
             * For write backs, if we can't lock the page, it's
             * invalid or in the process of being destroyed.  Skip
             * it, assuming someone else is writing it.
             */
            if (!page_trylock(pp, se))
                continue;
        }
        ASSERT(pp->p_vnode == vp);

        /*
         * Successfully locked the page, now figure out what to
         * do with it. Free pages are easily dealt with, invalidate
         * if desired or just go on to the next page.
         */
        if (PP_ISFREE(pp)) {
            if ((flags & B_INVAL) == 0) {
                page_unlock(pp);
                continue;
            }

            /*
             * Invalidate (destroy) the page.
             */
            mutex_exit(vphm);
            page_destroy_free(pp);
            mutex_enter(vphm);
            continue;
        }
        /*
         * pvn_getdirty() figures out what to do with a dirty page.
         * If the page is dirty, the putapage() routine will write it
         * and will kluster any other adjacent dirty pages it can.
         *
         * pvn_getdirty() and `(*putapage)' unlock the page.
         */
        mutex_exit(vphm);
        if (pvn_getdirty(pp, flags)) {
            error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
            if (!err)
                err = error;
        }
        mutex_enter(vphm);
    }
    page_vpsub(&vp->v_pages, mark);
    page_vpsub(&vp->v_pages, end);

leave:
    /*
     * Release v_pages mutex, also VVMLOCK and wake up blocked threads
     */
    mutex_exit(vphm);
    kmem_cache_free(marker_cache, mark);
    kmem_cache_free(marker_cache, end);
    mutex_enter(&vp->v_lock);
    vp->v_flag &= ~VVMLOCK;
    cv_broadcast(&vp->v_cv);
    mutex_exit(&vp->v_lock);
    return (err);
}
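/*
 * Usage sketch (not part of the original file): a VOP_PUTPAGE
 * implementation commonly hands whole-vnode requests (len == 0)
 * straight to pvn_vplist_dirty(), passing its own per-page writer;
 * foo_putapage is a hypothetical file-system-specific callback:
 *
 *	if (len == 0)
 *		err = pvn_vplist_dirty(vp, off, foo_putapage,
 *		    flags, cred);
 */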
/*
 * Walk the vp->v_pages list, for every page call the callback function
 * pointed by *page_check. If page_check returns non-zero, then mark the
 * page as modified and if VMODSORT is set, move it to the end of v_pages
 * list. Moving makes sense only if we have at least two pages - this also
 * avoids having v_pages temporarily being NULL after calling page_vpsub()
 * if there was just one page.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
    page_t *pp, *next, *end;
    kmutex_t *vphm;
    int shuffle;
    vphm = page_vnode_mutex(vp);
    mutex_enter(vphm);

    if (vp->v_pages == NULL) {
        mutex_exit(vphm);
        return;
    }

    end = vp->v_pages->p_vpprev;
    shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
    pp = vp->v_pages;

    for (;;) {
        next = pp->p_vpnext;
        if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
            /*
             * hat_setmod_only() in contrast to hat_setmod() does
             * not shuffle the pages and does not grab the mutex
             * page_vnode_mutex. Exactly what we need.
             */
            hat_setmod_only(pp);
            if (shuffle) {
                page_vpsub(&vp->v_pages, pp);
                ASSERT(vp->v_pages != NULL);
                page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
                    pp);
            }
        }
        /* Stop if we have just processed the last page. */
        if (pp == end)
            break;
        pp = next;
    }

    mutex_exit(vphm);
}
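/*
 * Usage sketch (not part of the original file): the page_check
 * callback decides per page whether to set the mod bit.  A trivial
 * caller that dirties every resident page might look like this;
 * always_dirty is a hypothetical name:
 *
 *	static int
 *	always_dirty(page_t *pp)
 *	{
 *		return (1);
 *	}
 *
 *	pvn_vplist_setdirty(vp, always_dirty);
 */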
/*
 * Zero out zbytes worth of data. Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage). Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
    caddr_t addr;

    ASSERT(vp->v_type != VCHR);

    if (vp->v_pages == NULL)
        return;
    /*
     * zbytes may be zero but there still may be some portion of
     * a page which needs clearing (since zbytes is a function
     * of filesystem block size, not pagesize.)
     */
    if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
        return;
    /*
     * We get the last page and handle the partial
     * zeroing via kernel mappings.  This will make the page
     * dirty so that we know that when this page is written
     * back, the zeroed information will go out with it.  If
     * the page is not currently in memory, then the kzero
     * operation will cause it to be brought in.  We use kzero
     * instead of bzero so that if the page cannot be read in
     * for any reason, the system will not panic.  We need
     * to zero out a minimum of the fs given zbytes, but we
     * might also have to do more to get the entire last page.
     */
    if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
        panic("pvn_vptrunc zbytes");
    addr = segmap_getmapflt(segkmap, vp, vplen,
        MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
    (void) kzero(addr + (vplen & MAXBOFFSET),
        MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
    (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}
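/*
 * Usage sketch (not part of the original file): on a truncate that
 * shortens a file to `length', a file system zeroes the tail of the
 * new last block before invalidating the pages past it; frag_bytes is
 * a hypothetical count of bytes left in the last file system block:
 *
 *	pvn_vpzero(vp, (u_offset_t)length, frag_bytes);
 */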
/*
 * Handles common work of the VOP_GETPAGE routines by iterating page by page
 * calling the getpage helper for each.
 */
int
pvn_getpages(
    int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
        size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
    struct vnode *vp,
    u_offset_t off,
    size_t len,
    uint_t *protp,
    page_t *pl[],
    size_t plsz,
    struct seg *seg,
    caddr_t addr,
    enum seg_rw rw,
    cred_t *cred)
{
    page_t **ppp;
    u_offset_t o, eoff;
    size_t sz, xlen;
    int err;

    /* ensure that we have enough space */
    ASSERT(pl == NULL || plsz >= len);
    /*
     * Loop one page at a time and let getapage function fill
     * in the next page in array.  We only allow one page to be
     * returned at a time (except for the last page) so that we
     * don't have any problems with duplicates and other such
     * painful problems.  This is a very simple minded algorithm,
     * but it does the job correctly.  We hope that the cost of a
     * getapage call for a resident page that we might have been
     * able to get from an earlier call doesn't cost too much.
     */
    ppp = pl;
    sz = (pl != NULL) ? PAGESIZE : 0;
    eoff = off + len;
    xlen = len;
    for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
        xlen -= PAGESIZE) {
        if (o + PAGESIZE >= eoff && pl != NULL) {
            /*
             * Last time through - allow all of
             * what's left of the pl[] array to be used.
             */
            sz = plsz - (o - off);
        }
        err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
            rw, cred);
        if (err) {
            /*
             * Release any pages we already got.
             */
            if (o > off && pl != NULL) {
                for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
                    (void) page_release(*ppp, 1);
            }
            break;
        }
        if (pl != NULL)
            ppp++;
    }
    return (err);
}
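/*
 * Usage sketch (not part of the original file): a file system's
 * VOP_GETPAGE entry can defer multi-page requests to pvn_getpages(),
 * supplying its single-page helper; foo_getapage is hypothetical:
 *
 *	if (len > PAGESIZE)
 *		err = pvn_getpages(foo_getapage, vp, off, len, protp,
 *		    pl, plsz, seg, addr, rw, cred);
 *	else
 *		err = foo_getapage(vp, off, len, protp, pl, plsz,
 *		    seg, addr, rw, cred);
 */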
/*
 * Initialize the page list array.
 */
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
    ssize_t sz;
    page_t *ppcur, **ppp;

    /*
     * Set up to load plsz worth
     * starting at the needed page.
     */
    while (pp != NULL && pp->p_offset != off) {
        /*
         * Remove page from the i/o list,
         * release the i/o and the page lock.
         */
        ppcur = pp;
        page_sub(&pp, ppcur);
        page_io_unlock(ppcur);
        (void) page_release(ppcur, 1);
    }

    if (pp == NULL) {
        pl[0] = NULL;
        return;
    }

    sz = plsz;
    /*
     * Initialize the page list array.
     */
    ppp = pl;
    do {
        ppcur = pp;
        *ppp++ = ppcur;
        page_sub(&pp, ppcur);
        page_io_unlock(ppcur);
        if (rw != S_CREATE)
            page_downgrade(ppcur);
        sz -= PAGESIZE;
    } while (sz > 0 && pp != NULL);
    *ppp = NULL;		/* terminate list */
    /*
     * Now free the remaining pages that weren't
     * loaded in the page list.
     */
    while (pp != NULL) {
        ppcur = pp;
        page_sub(&pp, ppcur);
        page_io_unlock(ppcur);
        (void) page_release(ppcur, 1);
    }
}
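/*
 * Usage sketch (not part of the original file): at the end of a
 * getapage routine, the pages read (or created) are handed back to
 * the caller's pl[] array in one step:
 *
 *	if (pl != NULL)
 *		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
 */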