/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/vnode.h>
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/fs/swapnode.h>
#include <sys/fs_subr.h>
#include <vm/seg_kp.h>
/*
 * Define the routines within this file.
 */
static int swap_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int swap_putpage(struct vnode *vp, offset_t off, size_t len,
    int flags, struct cred *cr, caller_context_t *ct);
static void swap_inactive(struct vnode *vp, struct cred *cr,
    caller_context_t *ct);
static void swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
    cred_t *cr, caller_context_t *ct);

static int swap_getapage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);

int swap_getconpage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
    uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int swap_putapage(struct vnode *vp, page_t *pp, uoff_t *off,
    size_t *lenp, int flags, struct cred *cr);
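/*
 * Vnode operations vector for swapfs vnodes.  Operations that swapfs does
 * not support are wired to fs_nosys(), which fails the request with ENOSYS.
 */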
const struct vnodeops swap_vnodeops = {
	.vnop_name		= "swapfs",
	.vop_inactive		= swap_inactive,
	.vop_getpage		= swap_getpage,
	.vop_putpage		= swap_putpage,
	.vop_dispose		= swap_dispose,
	.vop_setfl		= fs_nosys,
	.vop_poll		= (void *) fs_nosys,
	.vop_pathconf		= fs_nosys,
	.vop_getsecattr		= fs_nosys,
	.vop_shrlock		= fs_nosys,
};
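/*
 * swapfs has nothing to tear down when the last reference to a vnode goes
 * away; the debug trace below is the only work done here.
 */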
static void
swap_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
}
/*
 * Return all the pages from [off..off+len] in the given file.
 */
static int
swap_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
	    (void *)vp, off, len, 0, 0);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
	    "swapfs getpage:vp %p off %llx len %ld",
	    (void *)vp, off, len);

	return (pvn_getpages(swap_getapage, vp, (uoff_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr));
}
/*
 * Called from pvn_getpages to get a particular page.
 */
static int
swap_getapage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *rpp;
	struct vnode *pvp = NULL;

	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * Until there is a call-back mechanism to cause SEGKP
	 * pages to be unlocked, make them non-relocatable.
	 */
	if (SEG_IS_SEGKP(seg))
		flag_noreloc = PG_NORELOC;

	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

	if (pp = page_lookup(&vp->v_object, off, lock)) {

	pp = page_create_va(&vp->v_object, off, PAGESIZE,
	    PG_WAIT | PG_EXCL | flag_noreloc,
	    seg, addr);

	/*
	 * Someone raced in and created the page after we did the
	 * lookup but before we did the create, so go back and
	 * try to look it up again.
	 */

	if (rw != S_CREATE) {
		err = swap_getphysname(vp, off, &pvp, &poff);

		flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
		err = fop_pageio(pvp, pp, poff,
		    PAGESIZE, flags, cr, NULL);

		ahm = AH_MUTEX(vp, off);

		ap = swap_anon(vp, off);
		if (ap == NULL)
			panic("swap_getapage: null anon");

		if (ap->an_pvp == pvp &&
		    ap->an_poff == poff) {
			swap_phys_free(pvp, poff, PAGESIZE);

	pagezero(pp, 0, PAGESIZE);

	/*
	 * If it's a fault ahead, release page_io_lock
	 * and SE_EXCL we grabbed in page_create_va.
	 *
	 * If we are here, we haven't called fop_pageio
	 * and thus calling pvn_read_done(pp, B_READ)
	 * below may mislead that we tried i/o.  Besides,
	 * in case of async, pvn_read_done() should
	 * not be called by *getpage().
	 */

	/*
	 * swap_getphysname can return error
	 * only when we are getting called from
	 * swapslot_free which passes non-NULL pl.
	 */
	pvn_read_done(pp, B_ERROR);

	pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
/*
 * Called from large page anon routines only! This is an ugly hack where
 * the anon layer directly calls into swapfs with a preallocated large page.
 * Another method would have been to change the VOP and add an extra arg for
 * the preallocated large page. This all could be cleaned up later when we
 * solve the anonymous naming problem and no longer need to loop across the
 * VOP in PAGESIZE increments to fill in or initialize a large page as is
 * done today. I think the latter is better since it avoids a change to
 * the VOP interface that could later be avoided.
 */
int
swap_getconpage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, page_t *conpp, uint_t *pszc,
    spgcnt_t *nreloc, struct seg *seg, caddr_t addr, enum seg_rw rw,
    struct cred *cr)
{
	struct vnode *pvp = NULL;

	ASSERT(len == PAGESIZE);
	ASSERT(plsz == PAGESIZE);
	ASSERT(protp == NULL);
	ASSERT(nreloc != NULL);
	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * If we are not using a preallocated page then we know one already
	 * exists.  So just let the old code handle it.
	 */
	err = swap_getapage(vp, (uoff_t)off, len, protp, pl, plsz,
	    seg, addr, rw, cr);

	ASSERT(conpp->p_szc != 0);
	ASSERT(PAGE_EXCL(conpp));

	ASSERT(conpp->p_next == conpp);
	ASSERT(conpp->p_prev == conpp);
	ASSERT(!PP_ISAGED(conpp));
	ASSERT(!PP_ISFREE(conpp));

	pp = page_lookup_create(&vp->v_object, off, SE_SHARED, conpp,
	    nreloc, 0);

	/*
	 * If existing page is found we may need to relocate.
	 */
	ASSERT(rw != S_CREATE);
	ASSERT(pszc != NULL);
	ASSERT(PAGE_SHARED(pp));
	if (pp->p_szc < conpp->p_szc) {

	} else if (pp->p_szc > conpp->p_szc &&
	    seg->s_szc > conpp->p_szc) {
		*pszc = MIN(pp->p_szc, seg->s_szc);

	if (page_pptonum(pp) &
	    (page_get_pagecnt(conpp->p_szc) - 1))
		cmn_err(CE_PANIC, "swap_getconpage: no root");

	ASSERT(PAGE_EXCL(pp));

	ASSERT(rw != S_CREATE);

	/*
	 * If necessary do the page io.
	 */
	if (rw != S_CREATE) {
		/*
		 * Since we are only called now on behalf of an
		 * address space operation it's impossible for
		 * us to fail unlike swap_getapage() which
		 * also gets called from swapslot_free().
		 */
		if (swap_getphysname(vp, off, &pvp, &poff)) {
			cmn_err(CE_PANIC,
			    "swap_getconpage: swap_getphysname failed!");
		}

		err = fop_pageio(pvp, pp, poff, PAGESIZE, B_READ,
		    cr, NULL);

		ahm = AH_MUTEX(vp, off);

		ap = swap_anon(vp, off);
		if (ap == NULL)
			panic("swap_getconpage: null anon");
		if (ap->an_pvp != pvp || ap->an_poff != poff)
			panic("swap_getconpage: bad anon");

		swap_phys_free(pvp, poff, PAGESIZE);

	pagezero(pp, 0, PAGESIZE);

	/*
	 * Normally we would let pvn_read_done() destroy
	 * the page on IO error.  But since this is a preallocated
	 * page we'll let the anon layer handle it.
	 */
	page_hashout(pp, false);
	ASSERT(pp->p_next == pp);
	ASSERT(pp->p_prev == pp);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
/* Async putpage klustering stuff */
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);
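/* Counters tracking pages pushed, klustering, and putpage failure cases */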
static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;
/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */

static int
swap_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ct)
{
	struct async_reqs *arg;

	ASSERT(vp->v_count != 0);

	nowait = flags & B_PAGE_NOWAIT;

	/*
	 * Clear force flag so that p_lckcnt pages are not invalidated.
	 */
	flags &= ~(B_FORCE | B_PAGE_NOWAIT);

	SWAPFS_PRINT(SWAP_VOPS,
	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
	    (void *)vp, off, len, flags, 0);
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))
		return (0);

	if (curproc == proc_pageout)
		cmn_err(CE_PANIC, "swapfs: pageout can't block");

	/* Search the entire vp list for pages >= off. */
	err = pvn_vplist_dirty(vp, (uoff_t)off, swap_putapage,
	    flags, cr);

	/*
	 * Loop over all offsets in the range [off...off + len]
	 * looking for pages to deal with.
	 */
	for (io_off = (uoff_t)off; io_off < eoff;
	    io_off += io_len) {
		/*
		 * If we run out of the async req slot, put the page
		 * now instead of queuing.
		 */
		if (flags == (B_ASYNC | B_FREE) &&
		    sw_pending_size < klustsize &&
		    (arg = sw_getfree())) {
			/*
			 * If we are clustering, we should allow
			 * pageout to feed us more pages because # of
			 * pushes is limited by # of I/Os, and one
			 * cluster is considered to be one I/O.
			 */
			arg->a_len = PAGESIZE;
			arg->a_flags = B_ASYNC | B_FREE;

		/*
		 * If we are not invalidating pages, use the
		 * routine page_lookup_nowait() to prevent
		 * reclaiming them from the free list.
		 */
		if (!nowait && ((flags & B_INVAL) ||
		    (flags & (B_ASYNC | B_FREE)) == B_FREE))
			pp = page_lookup(&vp->v_object, io_off,
			    SE_EXCL);
		else
			pp = page_lookup_nowait(&vp->v_object, io_off,
			    (flags & (B_FREE | B_INVAL)) ? SE_EXCL : SE_SHARED);

		if (pp == NULL || pvn_getdirty(pp, flags) == 0)
			continue;

		err = swap_putapage(vp, pp, &io_off, &io_len,
		    flags, cr);

	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		cmn_err(CE_WARN,
		    "swap_putpage: B_INVAL, pages not gone");
/*
 * Write out a single page.
 * For swapfs this means choose a physical swap slot and write the page
 * out using fop_pageio.
 * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
 * swapfs pages, a bunch of contiguous swap slots and then write them
 * all out in one clustered i/o.
 */
	struct vnode *klvp = NULL;
	struct async_reqs *arg;
	size_t swap_klustsize;

	/*
	 * This check is added for callers who access swap_putpage with len = 0.
	 * swap_putpage calls swap_putapage page-by-page via pvn_vplist_dirty.
	 * And it's necessary to do the same queuing if users have the same
	 * B_ASYNC|B_FREE flags on.
	 */
	if (flags == (B_ASYNC | B_FREE) &&
	    sw_pending_size < klustsize && (arg = sw_getfree())) {
		arg->a_off = pp->p_offset;
		arg->a_len = PAGESIZE;
		arg->a_flags = B_ASYNC | B_FREE;

	SWAPFS_PRINT(SWAP_PUTP,
	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
	    pp, vp, pp->p_offset, flags, 0);

	ASSERT(PAGE_LOCKED(pp));

	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);

	/*
	 * If this is ASYNC | FREE and we've accumulated a bunch of such
	 * pending requests, kluster.
	 */
	if (flags == (B_ASYNC | B_FREE))
		swap_klustsize = klustsize;
	else
		swap_klustsize = PAGESIZE;
	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
	while (klsz < swap_klustsize) {
		if ((arg = sw_getreq()) == NULL) {
			swap_getiopages += btop(klsz);

		ASSERT(vn_matchops(arg->a_vp, &swap_vnodeops));

		if ((pp = page_lookup_nowait(&vp->v_object, off, se)) == NULL) {
			swap_otherpages += btop(klsz);

		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {

		/* Get new physical backing store for the page */
		if (err = swap_newphysname(vp, off, &doff, &dlen,
		    &pvp, &poff)) {
			swap_otherpages += btop(klsz);

		/* Try to cluster new physical name with previous ones */
		if (klvp == pvp && poff == klstart + klsz) {
			page_add(&pplist, pp);
			pplist = pplist->p_next;
		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
			page_add(&pplist, pp);

		swap_klustpages += btop(klsz);

	err = fop_pageio(klvp, pplist, klstart, klsz,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0)
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	swap_pagespushed += btop(klsz);

	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
	    vp, klvp, klstart, klsz);
	if (err && err != ENOMEM)
		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
static void
swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	uoff_t off = pp->p_offset;

	ASSERT(PAGE_EXCL(pp));

	/*
	 * The caller will free/invalidate the large page in one shot
	 * instead of one small page at a time.
	 */
	if (pp->p_szc != 0) {

	err = swap_getphysname(vp, off, &pvp, &poff);
	if (!err && pvp != NULL)
		fop_dispose(pvp, pp, fl, dn, cr, ct);
	else
		fs_dispose(vp, pp, fl, dn, cr, ct);
}