/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/cmn_err.h>
#include <sys/swap.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>

#include <sys/fs/swapnode.h>

#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/fs_subr.h>

#include <vm/seg_kp.h>

/*
 * Define the routines within this file.
 */
static int swap_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int swap_putpage(struct vnode *vp, offset_t off, size_t len,
    int flags, struct cred *cr, caller_context_t *ct);
static void swap_inactive(struct vnode *vp, struct cred *cr,
    caller_context_t *ct);
static void swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
    cred_t *cr, caller_context_t *ct);

static int swap_getapage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz,
    struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr);

int swap_getconpage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
    uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int swap_putapage(struct vnode *vp, page_t *pp, uoff_t *off,
    size_t *lenp, int flags, struct cred *cr);
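
/*
 * Vnode operations vector for swapfs. Only the paging entry points are
 * implemented; everything else a caller might try is turned away with
 * fs_nosys, since swapfs pages are reached solely through the
 * getpage/putpage/dispose paths below.
 */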
const struct vnodeops swap_vnodeops = {
	.vnop_name = "swapfs",
	.vop_inactive = swap_inactive,
	.vop_getpage = swap_getpage,
	.vop_putpage = swap_putpage,
	.vop_dispose = swap_dispose,
	.vop_setfl = fs_nosys,
	.vop_poll = (void *) fs_nosys,
	.vop_pathconf = fs_nosys,
	.vop_getsecattr = fs_nosys,
	.vop_shrlock = fs_nosys,
};

/* ARGSUSED */
static void
swap_inactive(
	struct vnode *vp,
	struct cred *cr,
	caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
}

/*
 * Return all the pages from [off..off+len] in the given file.
 */
/*ARGSUSED*/
static int
swap_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
	    (void *)vp, off, len, 0, 0);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
	    "swapfs getpage:vp %p off %llx len %ld",
	    (void *)vp, off, len);

	return (pvn_getpages(swap_getapage, vp, (uoff_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr));
}

/*
 * Called from pvn_getpages to get a particular page.
 */
/*ARGSUSED*/
static int
swap_getapage(
	struct vnode *vp,
	uoff_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr)
{
	struct page *pp, *rpp;
	int flags;
	int err = 0;
	struct vnode *pvp = NULL;
	uoff_t poff;
	int flag_noreloc;
	se_t lock;
	int upgrade = 0;

	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * Until there is a call-back mechanism to cause SEGKP
	 * pages to be unlocked, make them non-relocatable.
	 */
	if (SEG_IS_SEGKP(seg))
		flag_noreloc = PG_NORELOC;
	else
		flag_noreloc = 0;

	if (protp != NULL)
		*protp = PROT_ALL;
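
	/*
	 * S_CREATE callers are about to fill the page in, so they take
	 * it exclusively; other faults can share the lock.
	 */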
	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

again:
	if (pp = page_lookup(&vp->v_object, off, lock)) {
		if (pl) {
			if (upgrade)
				page_downgrade(pp);

			pl[0] = pp;
			pl[1] = NULL;
		} else {
			page_unlock(pp);
		}
	} else {
		pp = page_create_va(&vp->v_object, off, PAGESIZE,
		    PG_WAIT | PG_EXCL | flag_noreloc,
		    seg, addr);
		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */
		if (pp == NULL)
			goto again;
		if (rw != S_CREATE) {
			err = swap_getphysname(vp, off, &pvp, &poff);
			if (pvp) {
				struct anon *ap;
				kmutex_t *ahm;
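
				/*
				 * With no page list to fill in (pl == NULL)
				 * this is a fault-ahead: issue the i/o
				 * B_ASYNC and don't wait for it.
				 */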
				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
				err = fop_pageio(pvp, pp, poff,
				    PAGESIZE, flags, cr, NULL);

				if (!err) {
					ahm = AH_MUTEX(vp, off);
					mutex_enter(ahm);
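
					/*
					 * The slot that was just read in
					 * is no longer needed while the
					 * page stays in memory: free it,
					 * and mark the page modified so a
					 * new slot is named if it is ever
					 * pushed out again.
					 */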
					ap = swap_anon(vp, off);
					if (ap == NULL) {
						panic("swap_getapage:"
						    " null anon");
					}

					if (ap->an_pvp == pvp &&
					    ap->an_poff == poff) {
						swap_phys_free(pvp, poff,
						    PAGESIZE);
						ap->an_pvp = NULL;
						ap->an_poff = 0;
						hat_setmod(pp);
					}

					mutex_exit(ahm);
				}
			} else {
				if (!err)
					pagezero(pp, 0, PAGESIZE);

				/*
				 * If it's a fault ahead, release page_io_lock
				 * and SE_EXCL we grabbed in page_create_va
				 *
				 * If we are here, we haven't called fop_pageio
				 * and thus calling pvn_read_done(pp, B_READ)
				 * below would wrongly suggest that we did i/o.
				 * Besides, in case of async, pvn_read_done()
				 * should not be called by *getpage()
				 */
				if (pl == NULL) {
					/*
					 * swap_getphysname can return error
					 * only when we are getting called from
					 * swapslot_free which passes non-NULL
					 * pl to fop_getpage.
					 */
					ASSERT(err == 0);
					page_io_unlock(pp);
					page_unlock(pp);
				}
			}
		}
	}

	ASSERT(pp != NULL);

	if (err && pl)
		pvn_read_done(pp, B_ERROR);

	if (!err && pl)
		pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
	return (err);
}

/*
 * Called from large page anon routines only! This is an ugly hack where
 * the anon layer directly calls into swapfs with a preallocated large page.
 * Another method would have been to change the VOP and add an extra arg for
 * the preallocated large page. This all could be cleaned up later when we
 * solve the anonymous naming problem and no longer need to loop across the
 * VOP in PAGESIZE increments to fill in or initialize a large page as is
 * done today. I think the latter is better since it avoids a change to the
 * VOP interface that could later be avoided.
 */
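/*
 * A sketch of the expected call (illustrative only, not taken from an
 * actual caller): the anon layer hands in the preallocated large page as
 * conpp and walks it one PAGESIZE constituent at a time, e.g.
 *
 *	err = swap_getconpage(vp, off, PAGESIZE, NULL, pl, PAGESIZE,
 *	    conpp, &pszc, &nreloc, seg, addr, rw, cred);
 *
 * The ASSERTs at the top of the function spell out this contract: len and
 * plsz must be PAGESIZE, protp must be NULL and nreloc must be non-NULL.
 */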
int
swap_getconpage(
	struct vnode *vp,
	uoff_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	page_t *conpp,
	uint_t *pszc,
	spgcnt_t *nreloc,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr)
{
	struct page *pp;
	int err = 0;
	struct vnode *pvp = NULL;
	uoff_t poff;

	ASSERT(len == PAGESIZE);
	ASSERT(pl != NULL);
	ASSERT(plsz == PAGESIZE);
	ASSERT(protp == NULL);
	ASSERT(nreloc != NULL);
	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * If we are not using a preallocated page then we know one already
	 * exists. So just let the old code handle it.
	 */
	if (conpp == NULL) {
		err = swap_getapage(vp, (uoff_t)off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		return (err);
	}
	ASSERT(conpp->p_szc != 0);
	ASSERT(PAGE_EXCL(conpp));

	ASSERT(conpp->p_next == conpp);
	ASSERT(conpp->p_prev == conpp);
	ASSERT(!PP_ISAGED(conpp));
	ASSERT(!PP_ISFREE(conpp));

	*nreloc = 0;
	pp = page_lookup_create(&vp->v_object, off, SE_SHARED, conpp, nreloc,
	    0);
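
	/*
	 * page_lookup_create() either returns an existing page at this
	 * offset or inserts conpp; the checks below distinguish the two,
	 * plus the relocation case reported through *nreloc.
	 */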
	/*
	 * If existing page is found we may need to relocate.
	 */
	if (pp != conpp) {
		ASSERT(rw != S_CREATE);
		ASSERT(pszc != NULL);
		ASSERT(PAGE_SHARED(pp));
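		/*
		 * A page size mismatch is reported back through *pszc
		 * and a negative return value rather than a hard errno.
		 */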
		if (pp->p_szc < conpp->p_szc) {
			*pszc = pp->p_szc;
			page_unlock(pp);
			err = -1;
		} else if (pp->p_szc > conpp->p_szc &&
		    seg->s_szc > conpp->p_szc) {
			*pszc = MIN(pp->p_szc, seg->s_szc);
			page_unlock(pp);
			err = -2;
		} else {
			pl[0] = pp;
			pl[1] = NULL;
			if (page_pptonum(pp) &
			    (page_get_pagecnt(conpp->p_szc) - 1))
				cmn_err(CE_PANIC, "swap_getconpage: no root");
		}
		return (err);
	}

	ASSERT(PAGE_EXCL(pp));

	if (*nreloc != 0) {
		ASSERT(rw != S_CREATE);
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	*nreloc = 1;

	/*
	 * If necessary do the page io.
	 */
	if (rw != S_CREATE) {
		/*
		 * Since we are only called now on behalf of an
		 * address space operation it's impossible for
		 * us to fail unlike swap_getapage() which
		 * also gets called from swapslot_free().
		 */
		if (swap_getphysname(vp, off, &pvp, &poff)) {
			cmn_err(CE_PANIC,
			    "swap_getconpage: swap_getphysname failed!");
		}

		if (pvp != NULL) {
			err = fop_pageio(pvp, pp, poff, PAGESIZE, B_READ,
			    cr, NULL);
			if (err == 0) {
				struct anon *ap;
				kmutex_t *ahm;

				ahm = AH_MUTEX(vp, off);
				mutex_enter(ahm);
				ap = swap_anon(vp, off);
				if (ap == NULL)
					panic("swap_getconpage: null anon");
				if (ap->an_pvp != pvp || ap->an_poff != poff)
					panic("swap_getconpage: bad anon");

				swap_phys_free(pvp, poff, PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
				hat_setmod(pp);
				mutex_exit(ahm);
			}
		} else {
			pagezero(pp, 0, PAGESIZE);
		}
	}

	/*
	 * Normally we would let pvn_read_done() destroy
	 * the page on IO error. But since this is a preallocated
	 * page we'll let the anon layer handle it.
	 */
	page_io_unlock(pp);
	if (err != 0)
		page_hashout(pp, false);
	ASSERT(pp->p_next == pp);
	ASSERT(pp->p_prev == pp);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);

	pl[0] = pp;
	pl[1] = NULL;
	return (err);
}

/* Async putpage klustering stuff */
int sw_pending_size;
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);
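
/*
 * Each fail/pages pair below counts how often the kluster loop in
 * swap_putapage() stopped for a given reason, and how many pages had
 * been clustered when it stopped.
 */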
static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;

/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */
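
/*
 * Illustrative only: a full invalidation of a swap vnode's cached pages
 * would look like
 *
 *	(void) fop_putpage(vp, 0, 0, B_INVAL, cred, NULL);
 *
 * since len == 0 means "operate from off to EOF".
 */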

/* ARGSUSED */
static int
swap_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int flags,
	struct cred *cr,
	caller_context_t *ct)
{
	page_t *pp;
	uoff_t io_off;
	size_t io_len = 0;
	int err = 0;
	int nowait;
	struct async_reqs *arg;

	if (swap_nopage)
		return (0);

	ASSERT(vp->v_count != 0);

	nowait = flags & B_PAGE_NOWAIT;

	/*
	 * Clear force flag so that p_lckcnt pages are not invalidated.
	 */
	flags &= ~(B_FORCE | B_PAGE_NOWAIT);

	SWAPFS_PRINT(SWAP_VOPS,
	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
	    (void *)vp, off, len, flags, 0);
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))
		return (0);

	if (len == 0) {
		if (curproc == proc_pageout)
			cmn_err(CE_PANIC, "swapfs: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (uoff_t)off, swap_putapage,
		    flags, cr);
	} else {
		uoff_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = off + len;
		for (io_off = (uoff_t)off; io_off < eoff;
		    io_off += io_len) {
			/*
			 * If we run out of async req slots, put the page
			 * now instead of queuing.
			 */
			if (flags == (B_ASYNC | B_FREE) &&
			    sw_pending_size < klustsize &&
			    (arg = sw_getfree())) {
				/*
				 * If we are clustering, we should allow
				 * pageout to feed us more pages because # of
				 * pushes is limited by # of I/Os, and one
				 * cluster is considered to be one I/O.
				 */
				if (pushes)
					pushes--;

				arg->a_vp = vp;
				arg->a_off = io_off;
				arg->a_len = PAGESIZE;
				arg->a_flags = B_ASYNC | B_FREE;
				arg->a_cred = kcred;
				sw_putreq(arg);
				io_len = PAGESIZE;
				continue;
			}
			/*
			 * If we are not invalidating pages, use the
			 * routine page_lookup_nowait() to prevent
			 * reclaiming them from the free list.
			 */
			if (!nowait && ((flags & B_INVAL) ||
			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
				pp = page_lookup(&vp->v_object, io_off,
				    SE_EXCL);
			else
				pp = page_lookup_nowait(&vp->v_object,
				    io_off,
				    (flags & (B_FREE | B_INVAL)) ?
				    SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				io_len = PAGESIZE;
			else {
				err = swap_putapage(vp, pp, &io_off, &io_len,
				    flags, cr);
				if (err != 0)
					break;
			}
		}
	}
	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		cmn_err(CE_WARN,
		    "swap_putpage: B_INVAL, pages not gone");
	}
	return (err);
}

/*
 * Write out a single page.
 * For swapfs this means choose a physical swap slot and write the page
 * out using fop_pageio.
 * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
 * swapfs pages, a bunch of contiguous swap slots and then write them
 * all out in one clustered i/o.
 */
/*ARGSUSED*/
static int
swap_putapage(
	struct vnode *vp,
	page_t *pp,
	uoff_t *offp,
	size_t *lenp,
	int flags,
	struct cred *cr)
{
	int err;
	struct vnode *pvp;
	uoff_t poff, off;
	uoff_t doff;
	size_t dlen;
	size_t klsz = 0;
	uoff_t klstart = 0;
	struct vnode *klvp = NULL;
	page_t *pplist;
	se_t se;
	struct async_reqs *arg;
	size_t swap_klustsize;

	/*
	 * This check is added for callers who access swap_putpage with
	 * len = 0. swap_putpage calls swap_putapage page-by-page via
	 * pvn_vplist_dirty, so the same queuing must be done here when
	 * callers pass the same B_ASYNC|B_FREE flags.
	 */
	if (flags == (B_ASYNC | B_FREE) &&
	    sw_pending_size < klustsize && (arg = sw_getfree())) {

		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);

		arg->a_vp = vp;
		arg->a_off = pp->p_offset;
		arg->a_len = PAGESIZE;
		arg->a_flags = B_ASYNC | B_FREE;
		arg->a_cred = kcred;
		sw_putreq(arg);

		return (0);
	}

	SWAPFS_PRINT(SWAP_PUTP,
	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
	    pp, vp, pp->p_offset, flags, 0);

	ASSERT(PAGE_LOCKED(pp));

	off = pp->p_offset;

	doff = off;
	dlen = PAGESIZE;

	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);
		goto out;
	}

	klvp = pvp;
	klstart = poff;
	pplist = pp;
	/*
	 * If this is ASYNC | FREE and we've accumulated a bunch of such
	 * pending requests, kluster.
	 */
	if (flags == (B_ASYNC | B_FREE))
		swap_klustsize = klustsize;
	else
		swap_klustsize = PAGESIZE;
	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
	klsz = PAGESIZE;
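
	/*
	 * Pull queued async requests and keep growing the cluster while
	 * each newly named swap slot is physically contiguous with it,
	 * either immediately past the end (klstart + klsz) or immediately
	 * before the start (klstart - PAGESIZE).
	 */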
	while (klsz < swap_klustsize) {
		if ((arg = sw_getreq()) == NULL) {
			swap_getiofail++;
			swap_getiopages += btop(klsz);
			break;
		}
		ASSERT(vn_matchops(arg->a_vp, &swap_vnodeops));
		vp = arg->a_vp;
		off = arg->a_off;

		if ((pp = page_lookup_nowait(&vp->v_object, off, se)) == NULL) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			sw_putfree(arg);
			break;
		}
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
			sw_putfree(arg);
			continue;
		}
		/* Get new physical backing store for the page */
		doff = off;
		dlen = PAGESIZE;
		if (err = swap_newphysname(vp, off, &doff, &dlen,
		    &pvp, &poff)) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
		/* Try to cluster new physical name with previous ones */
		if (klvp == pvp && poff == klstart + klsz) {
			klsz += PAGESIZE;
			page_add(&pplist, pp);
			pplist = pplist->p_next;
			sw_putfree(arg);
		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
			klsz += PAGESIZE;
			klstart -= PAGESIZE;
			page_add(&pplist, pp);
			sw_putfree(arg);
		} else {
			swap_klustfail++;
			swap_klustpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
	}

	err = fop_pageio(klvp, pplist, klstart, klsz,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0)
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	/* Statistics */
	if (!err) {
		swap_putpagecnt++;
		swap_pagespushed += btop(klsz);
	}
out:
	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
	    vp, klvp, klstart, klsz);
	if (err && err != ENOMEM)
		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
	if (lenp)
		*lenp = PAGESIZE;
	return (err);
}

static void
swap_dispose(
	vnode_t *vp,
	page_t *pp,
	int fl,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;
	uoff_t off = pp->p_offset;
	vnode_t *pvp;
	uoff_t poff;

	ASSERT(PAGE_EXCL(pp));

	/*
	 * The caller will free/invalidate a large page in one shot
	 * instead of one small page at a time.
	 */
	if (pp->p_szc != 0) {
		page_unlock(pp);
		return;
	}
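
	/*
	 * If the slot has physical backing store, let the underlying
	 * swap device's vnode dispose of the page; otherwise fall back
	 * to the generic fs_dispose().
	 */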
	err = swap_getphysname(vp, off, &pvp, &poff);
	if (!err && pvp != NULL)
		fop_dispose(pvp, pp, fl, dn, cr, ct);
	else
		fs_dispose(vp, pp, fl, dn, cr, ct);
}