/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
#include <sys/cmn_err.h>
#include <sys/swap.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>

#include <sys/fs/swapnode.h>

#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <fs/fs_subr.h>

#include <vm/seg_kp.h>
/*
 * Define the routines within this file.
 */
static int	swap_getpage(struct vnode *vp, offset_t off, size_t len,
	uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
	caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int	swap_putpage(struct vnode *vp, offset_t off, size_t len,
	int flags, struct cred *cr, caller_context_t *ct);
static void	swap_inactive(struct vnode *vp, struct cred *cr,
	caller_context_t *ct);
static void	swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
	cred_t *cr, caller_context_t *ct);

static int	swap_getapage(struct vnode *vp, u_offset_t off, size_t len,
	uint_t *protp, page_t **plarr, size_t plsz,
	struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);

int	swap_getconpage(struct vnode *vp, u_offset_t off, size_t len,
	uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
	uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
	enum seg_rw rw, struct cred *cr);

static int	swap_putapage(struct vnode *vp, page_t *pp, u_offset_t *off,
	size_t *lenp, int flags, struct cred *cr);
const fs_operation_def_t swap_vnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = swap_inactive },
	VOPNAME_GETPAGE,	{ .vop_getpage = swap_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = swap_putpage },
	VOPNAME_DISPOSE,	{ .vop_dispose = swap_dispose },
	VOPNAME_SETFL,		{ .error = fs_error },
	VOPNAME_POLL,		{ .error = fs_error },
	VOPNAME_PATHCONF,	{ .error = fs_error },
	VOPNAME_GETSECATTR,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	NULL,			NULL
};

vnodeops_t *swap_vnodeops;
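/*
 * For context: a template like the one above is turned into a live
 * vnodeops_t via vn_make_ops().  A minimal sketch of the registration,
 * assuming it runs from swapfs's initialization path (the call site
 * itself is not part of this file):
 *
 *	if (vn_make_ops("swapfs", swap_vnodeops_template,
 *	    &swap_vnodeops) != 0)
 *		cmn_err(CE_PANIC, "swapfs: bad vnode ops template");
 */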
/*ARGSUSED*/
static void
swap_inactive(
	struct vnode *vp,
	struct cred *cr,
	caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
}
/*
 * Return all the pages from [off..off+len] in the given file.
 */
/*ARGSUSED*/
static int
swap_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
	    (void *)vp, off, len, 0, 0);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
	    "swapfs getpage:vp %p off %llx len %ld",
	    (void *)vp, off, len);

	return (pvn_getpages(swap_getapage, vp, (u_offset_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr));
}
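/*
 * Note: pvn_getpages() walks the requested range and invokes the
 * per-page routine (swap_getapage() here) once per PAGESIZE chunk,
 * accumulating results in pl[].  A rough, hypothetical sketch of that
 * contract, not the actual VM code:
 *
 *	for (o = off; o < off + len; o += PAGESIZE)
 *		err = getapage(vp, o, PAGESIZE, protp, pl, plsz,
 *		    seg, addr + (o - off), rw, cr);
 */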
/*
 * Called from pvn_getpages to get a particular page.
 */
/*ARGSUSED*/
static int
swap_getapage(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr)
{
	struct page *pp, *rpp;
	int flags;
	int err = 0;
	struct vnode *pvp = NULL;
	u_offset_t poff;
	int flag_noreloc;
	se_t lock;
	extern int kcage_on;
	int upgrade = 0;

	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * Until there is a call-back mechanism to cause SEGKP
	 * pages to be unlocked, make them non-relocatable.
	 */
	if (SEG_IS_SEGKP(seg))
		flag_noreloc = PG_NORELOC;
	else
		flag_noreloc = 0;

	if (protp != NULL)
		*protp = PROT_ALL;

	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

again:
	if (pp = page_lookup(vp, off, lock)) {
		/*
		 * In very rare instances, a segkp page may have been
		 * relocated outside of the kernel by the kernel cage
		 * due to the window between page_unlock() and
		 * VOP_PUTPAGE() in segkp_unlock().  Due to the
		 * rareness of these occurrences, the solution is to
		 * relocate the page to a P_NORELOC page.
		 */
		if (flag_noreloc != 0) {
			if (!PP_ISNORELOC(pp) && kcage_on) {
				if (lock != SE_EXCL) {
					upgrade = 1;
					if (!page_tryupgrade(pp)) {
						page_unlock(pp);
						lock = SE_EXCL;
						goto again;
					}
				}

				if (page_relocate_cage(&pp, &rpp) != 0)
					panic("swap_getapage: "
					    "page_relocate_cage failed");

				pp = rpp;
			}
		}

		if (pl) {
			if (upgrade)
				page_downgrade(pp);

			pl[0] = pp;
			pl[1] = NULL;
		} else {
			page_unlock(pp);
		}
	} else {
		pp = page_create_va(vp, off, PAGESIZE,
		    PG_WAIT | PG_EXCL | flag_noreloc,
		    seg, addr);

		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */
		if (pp == NULL)
			goto again;

		if (rw != S_CREATE) {
			err = swap_getphysname(vp, off, &pvp, &poff);
			if (pvp) {
				struct anon *ap;
				kmutex_t *ahm;

				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
				err = VOP_PAGEIO(pvp, pp, poff,
				    PAGESIZE, flags, cr, NULL);

				if (!err) {
					ahm = AH_MUTEX(vp, off);
					mutex_enter(ahm);

					ap = swap_anon(vp, off);
					if (ap == NULL) {
						panic("swap_getapage:"
						    " null anon");
					}

					if (ap->an_pvp == pvp &&
					    ap->an_poff == poff) {
						/*
						 * The slot contents are now
						 * in core: release the
						 * physical slot and dirty the
						 * page so a later pageout
						 * allocates a fresh one.
						 */
						swap_phys_free(pvp, poff,
						    PAGESIZE);
						ap->an_pvp = NULL;
						ap->an_poff = 0;
						hat_setmod(pp);
					}

					mutex_exit(ahm);
				}
			} else {
				if (!err)
					pagezero(pp, 0, PAGESIZE);

				/*
				 * If it's a fault ahead, release page_io_lock
				 * and SE_EXCL we grabbed in page_create_va
				 *
				 * If we are here, we haven't called VOP_PAGEIO
				 * and thus calling pvn_read_done(pp, B_READ)
				 * below may mislead that we tried i/o. Besides,
				 * in case of async, pvn_read_done() should
				 * not be called by *getpage()
				 */
				if (pl == NULL) {
					/*
					 * swap_getphysname can return error
					 * only when we are getting called from
					 * swapslot_free which passes non-NULL
					 * pl to VOP_GETPAGE.
					 */
					ASSERT(err == 0);
					page_io_unlock(pp);
					page_unlock(pp);
				}
			}
		}

		ASSERT(pp != NULL);

		if (err && pl)
			pvn_read_done(pp, B_ERROR);

		if (!err && pl)
			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
	}
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
	return (err);
}
/*
 * Called from large page anon routines only! This is an ugly hack where
 * the anon layer directly calls into swapfs with a preallocated large page.
 * Another method would have been to change the VOP and add an extra arg for
 * the preallocated large page. This all could be cleaned up later when we
 * solve the anonymous naming problem and no longer need to loop across the
 * VOP in PAGESIZE increments to fill in or initialize a large page as is
 * done today. I think the latter is better since it avoids a change to the
 * VOP interface that could later be avoided.
 */
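/*
 * For illustration only: the PAGESIZE-increment pattern the comment
 * above refers to is a caller-side loop of roughly this shape
 * (hypothetical sketch, not code from the anon layer):
 *
 *	for (i = 0; i < pages; i++, addr += PAGESIZE, off += PAGESIZE)
 *		(void) VOP_GETPAGE(vp, off, PAGESIZE, ...);
 */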
int
swap_getconpage(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	page_t *conpp,
	uint_t *pszc,
	spgcnt_t *nreloc,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr)
{
	struct page	*pp;
	struct vnode	*pvp = NULL;
	u_offset_t	poff;
	int		err = 0;

	ASSERT(len == PAGESIZE);
	ASSERT(pl != NULL);
	ASSERT(plsz == PAGESIZE);
	ASSERT(protp == NULL);
	ASSERT(nreloc != NULL);
	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * If we are not using a preallocated page then we know one already
	 * exists. So just let the old code handle it.
	 */
	if (conpp == NULL) {
		err = swap_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		return (err);
	}
	ASSERT(conpp->p_szc != 0);
	ASSERT(PAGE_EXCL(conpp));

	ASSERT(conpp->p_next == conpp);
	ASSERT(conpp->p_prev == conpp);
	ASSERT(!PP_ISAGED(conpp));
	ASSERT(!PP_ISFREE(conpp));

	*nreloc = 0;
	pp = page_lookup_create(vp, off, SE_SHARED, conpp, nreloc, 0);
	/*
	 * If existing page is found we may need to relocate.
	 */
	if (pp != conpp) {
		ASSERT(rw != S_CREATE);
		ASSERT(pszc != NULL);
		ASSERT(PAGE_SHARED(pp));
		if (pp->p_szc < conpp->p_szc) {
			*pszc = pp->p_szc;
			page_unlock(pp);
			err = -1;
		} else if (pp->p_szc > conpp->p_szc &&
		    seg->s_szc > conpp->p_szc) {
			*pszc = MIN(pp->p_szc, seg->s_szc);
			page_unlock(pp);
			err = -2;
		} else {
			pl[0] = pp;
			pl[1] = NULL;
			if (page_pptonum(pp) &
			    (page_get_pagecnt(conpp->p_szc) - 1))
				cmn_err(CE_PANIC, "swap_getconpage: no root");
		}
		return (err);
	}

	ASSERT(PAGE_EXCL(pp));

	if (*nreloc != 0) {
		ASSERT(rw != S_CREATE);
		pl[0] = pp;
		pl[1] = NULL;
		return (err);
	}
	/*
	 * If necessary do the page io.
	 */
	if (rw != S_CREATE) {
		/*
		 * Since we are only called now on behalf of an
		 * address space operation it's impossible for
		 * us to fail unlike swap_getapage() which
		 * also gets called from swapslot_free().
		 */
		if (swap_getphysname(vp, off, &pvp, &poff)) {
			cmn_err(CE_PANIC,
			    "swap_getconpage: swap_getphysname failed!");
		}

		if (pvp != NULL) {
			err = VOP_PAGEIO(pvp, pp, poff, PAGESIZE, B_READ,
			    cr, NULL);
			if (err == 0) {
				struct anon *ap;
				kmutex_t *ahm;

				ahm = AH_MUTEX(vp, off);
				mutex_enter(ahm);
				ap = swap_anon(vp, off);
				if (ap == NULL)
					panic("swap_getconpage: null anon");
				if (ap->an_pvp != pvp || ap->an_poff != poff)
					panic("swap_getconpage: bad anon");

				swap_phys_free(pvp, poff, PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
				hat_setmod(pp);
				mutex_exit(ahm);
			}
		} else {
			pagezero(pp, 0, PAGESIZE);
		}
	}

	/*
	 * Normally we would let pvn_read_done() destroy
	 * the page on IO error. But since this is a preallocated
	 * page we'll let the anon layer handle it.
	 */
	page_io_unlock(pp);
	if (err != 0)
		page_hashout(pp, NULL);
	ASSERT(pp->p_next == pp);
	ASSERT(pp->p_prev == pp);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);

	pl[0] = pp;
	pl[1] = NULL;
	return (err);
}
/* Async putpage klustering stuff */
int sw_pending_size;
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);

static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;
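/*
 * Each (fail, pages) pair above counts why a kluster was cut short and
 * how many pages had been gathered when that happened.  On a live
 * system these can presumably be read with mdb(1) (sketch, assuming a
 * 64-bit kernel):
 *
 *	# echo 'swap_klustpages/E' | mdb -k
 */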
/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */
/* ARGSUSED */
static int
swap_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	page_t *pp;
	u_offset_t io_off;
	size_t io_len = 0;
	int err = 0;
	int nowait;
	struct async_reqs *arg;

	if (swap_nopage)
		return (0);

	ASSERT(vp->v_count != 0);

	nowait = flags & B_PAGE_NOWAIT;

	/*
	 * Clear force flag so that p_lckcnt pages are not invalidated.
	 */
	flags &= ~(B_FORCE | B_PAGE_NOWAIT);

	SWAPFS_PRINT(SWAP_VOPS,
	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
	    (void *)vp, off, len, flags, 0);
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))
		return (0);

	if (len == 0) {
		if (curproc == proc_pageout)
			cmn_err(CE_PANIC, "swapfs: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, swap_putapage,
		    flags, cr);
	} else {
		u_offset_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = off + len;
		for (io_off = (u_offset_t)off; io_off < eoff;
		    io_off += io_len) {
			/*
			 * If we run out of async req slots, put the page
			 * now instead of queuing.
			 */
			if (flags == (B_ASYNC | B_FREE) &&
			    sw_pending_size < klustsize &&
			    (arg = sw_getfree())) {
				/*
				 * If we are clustering, we should allow
				 * pageout to feed us more pages because # of
				 * pushes is limited by # of I/Os, and one
				 * cluster is considered to be one I/O.
				 */
				if (pushes)
					pushes--;

				arg->a_vp = vp;
				arg->a_off = io_off;
				arg->a_len = PAGESIZE;
				arg->a_flags = B_ASYNC | B_FREE;
				arg->a_cred = kcred;
				sw_putreq(arg);
				io_len = PAGESIZE;
				continue;
			}

			/*
			 * If we are not invalidating pages, use the
			 * routine page_lookup_nowait() to prevent
			 * reclaiming them from the free list.
			 */
			if (!nowait && ((flags & B_INVAL) ||
			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
				pp = page_lookup(vp, io_off, SE_EXCL);
			else
				pp = page_lookup_nowait(vp, io_off,
				    (flags & (B_FREE | B_INVAL)) ?
				    SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				err = swap_putapage(vp, pp, &io_off, &io_len,
				    flags, cr);
				if (err != 0)
					break;
			}
		}
	}
	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		cmn_err(CE_WARN,
		    "swap_putpage: B_INVAL, pages not gone");
	}

	return (err);
}
/*
 * Write out a single page.
 * For swapfs this means choosing a physical swap slot and writing the page
 * out using VOP_PAGEIO.
 * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
 * swapfs pages and a run of contiguous swap slots, and then write them
 * all out in one clustered i/o.
 */
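/*
 * In other words, the klustered case collapses many PAGESIZE writes
 * into a single VOP_PAGEIO() over one physically contiguous slot run.
 * A rough sketch of the shape (not the exact code below):
 *
 *	while (klsz < klustsize && next queued page's slot is adjacent) {
 *		page_add(&pplist, pp);
 *		klsz += PAGESIZE;
 *	}
 *	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
 *	    B_WRITE | flags, cr, NULL);
 */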
/*ARGSUSED*/
static int
swap_putapage(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,
	size_t *lenp,
	int flags,
	struct cred *cr)
{
	int err;
	struct vnode *pvp;
	u_offset_t poff, off;
	u_offset_t doff;
	size_t dlen;
	size_t klsz = 0;
	u_offset_t klstart = 0;
	struct vnode *klvp = NULL;
	page_t *pplist;
	se_t se;
	struct async_reqs *arg;
	size_t swap_klustsize;

	/*
	 * This check handles callers who call swap_putpage() with len == 0:
	 * swap_putpage() then reaches swap_putapage() page-by-page via
	 * pvn_vplist_dirty(), so the same B_ASYNC|B_FREE queuing done in
	 * swap_putpage() has to happen here as well.
	 */
	if (flags == (B_ASYNC | B_FREE) &&
	    sw_pending_size < klustsize && (arg = sw_getfree())) {

		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);

		arg->a_vp = vp;
		arg->a_off = pp->p_offset;
		arg->a_len = PAGESIZE;
		arg->a_flags = B_ASYNC | B_FREE;
		arg->a_cred = kcred;
		sw_putreq(arg);

		return (0);
	}

	SWAPFS_PRINT(SWAP_PUTP,
	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
	    pp, vp, pp->p_offset, flags, 0);

	ASSERT(PAGE_LOCKED(pp));

	off = pp->p_offset;
	doff = off;
	dlen = PAGESIZE;

	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);
		goto out;
	}

	klvp = pvp;
	klstart = poff;
	pplist = pp;
	klsz = PAGESIZE;
	/*
	 * If this is ASYNC | FREE and we've accumulated a bunch of such
	 * pending requests, kluster.
	 */
	if (flags == (B_ASYNC | B_FREE))
		swap_klustsize = klustsize;
	else
		swap_klustsize = PAGESIZE;
	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);

	while (klsz < swap_klustsize) {
		if ((arg = sw_getreq()) == NULL) {
			swap_getiofail++;
			swap_getiopages += btop(klsz);
			break;
		}
		ASSERT(vn_matchops(arg->a_vp, swap_vnodeops));
		vp = arg->a_vp;
		off = arg->a_off;

		if ((pp = page_lookup_nowait(vp, off, se)) == NULL) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			sw_putfree(arg);
			break;
		}
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
			sw_putfree(arg);
			continue;
		}
		/* Get new physical backing store for the page */
		doff = off;
		dlen = PAGESIZE;
		if (err = swap_newphysname(vp, off, &doff, &dlen,
		    &pvp, &poff)) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
		/* Try to cluster new physical name with previous ones */
		if (klvp == pvp && poff == klstart + klsz) {
			klsz += PAGESIZE;
			page_add(&pplist, pp);
			pplist = pplist->p_next;
			sw_putfree(arg);
		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
			klsz += PAGESIZE;
			klstart -= PAGESIZE;
			page_add(&pplist, pp);
			sw_putfree(arg);
		} else {
			swap_klustfail++;
			swap_klustpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
	}
	err = VOP_PAGEIO(klvp, pplist, klstart, klsz,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0)
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	if (!err) {
		swap_putpagecnt++;
		swap_pagespushed += btop(klsz);
	}
out:
	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
	    vp, klvp, klstart, klsz);
	if (err && err != ENOMEM)
		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
	if (lenp)
		*lenp = PAGESIZE;
	return (err);
}
/*ARGSUSED*/
static void
swap_dispose(
	vnode_t *vp,
	page_t *pp,
	int fl,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;
	u_offset_t off = pp->p_offset;
	vnode_t *pvp;
	u_offset_t poff;

	ASSERT(PAGE_EXCL(pp));

	/*
	 * The caller will free/invalidate a large page in one shot instead
	 * of one small page at a time.
	 */
	if (pp->p_szc != 0) {
		page_unlock(pp);
		return;
	}

	err = swap_getphysname(vp, off, &pvp, &poff);
	if (!err && pvp != NULL)
		VOP_DISPOSE(pvp, pp, fl, dn, cr, ct);
	else
		fs_dispose(vp, pp, fl, dn, cr, ct);
}