drop net-snmp dep
[unleashed.git] / kernel / fs / dcfs / dc_vnops.c
blob20acdc9dd95d9872ba16fa38965b2a528ec4b676
2 /*
3 * CDDL HEADER START
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2017 by Delphix. All rights reserved.
27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 * The Regents of the University of California
33 * All Rights Reserved
35 * University Acknowledgment- Portions of this document are derived from
36 * software developed by the University of California, Berkeley, and its
37 * contributors.
40 #include <sys/types.h>
41 #include <sys/thread.h>
42 #include <sys/t_lock.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/bitmap.h>
46 #include <sys/buf.h>
47 #include <sys/cmn_err.h>
48 #include <sys/conf.h>
49 #include <sys/ddi.h>
50 #include <sys/debug.h>
51 #include <sys/errno.h>
52 #include <sys/time.h>
53 #include <sys/fcntl.h>
54 #include <sys/flock.h>
55 #include <sys/file.h>
56 #include <sys/kmem.h>
57 #include <sys/mman.h>
58 #include <sys/vmsystm.h>
59 #include <sys/open.h>
60 #include <sys/swap.h>
61 #include <sys/sysmacros.h>
62 #include <sys/uio.h>
63 #include <sys/vfs.h>
64 #include <sys/vnode.h>
65 #include <sys/stat.h>
66 #include <sys/poll.h>
67 #include <sys/zmod.h>
68 #include <sys/fs/decomp.h>
70 #include <vm/hat.h>
71 #include <vm/as.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/seg_vn.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/seg_map.h>
78 #include <sys/fs_subr.h>
81 * dcfs - A filesystem for automatic decompressing of fiocompressed files
83 * This filesystem is a layered filesystem that sits on top of a normal
84 * persistent filesystem and provides automatic decompression of files
85 * that have been previously compressed and stored on the host file system.
86 * This is a pseudo filesystem in that it does not persist data, rather it
87 * intercepts file lookup requests on the host filesystem and provides
88 * transparent decompression of those files. Currently the only supported
89 * host filesystem is ufs.
91 * A file is compressed via a userland utility (currently cmd/boot/fiocompress)
92 * and marked by fiocompress as a compressed file via a flag in the on-disk
93 * inode (set via the ufs _FIO_COMPRESSED ioctl() - see ufs_ioctl() in ufs_vnops.c).
94 * ufs_lookup checks for this flag and, if set, passes control to decompvp,
95 * a function defined in this (dcfs) filesystem. decompvp validates the header
96 * and returns a dcfs vnode to the VFS layer; blocks are decompressed on demand.
98 * dcfs is layered on top of ufs and passes requests involving persistence
99 * to the underlying ufs filesystem. The compressed files currently cannot be
100 * written to.
/*
 * Tunables and helper macros private to this file.
 *
 * DCHASH() turns a subordinate vnode pointer into a dctable[] bucket index:
 * the low DCSHFT bits of the pointer are discarded and the remainder is
 * reduced to the range [0, DCTABLESIZE).  A mask is used when DCTABLESIZE
 * is a power of two, a modulo otherwise.
 */
#define	DCSHFT		5
#define	DCTABLESIZE	16

#if ((DCTABLESIZE & (DCTABLESIZE - 1)) == 0)
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) & (DCTABLESIZE - 1))
#else
/* The divisor used to be misspelled, which broke non power-of-two sizes. */
#define	DCHASH(vp) (((uintptr_t)(vp) >> DCSHFT) % DCTABLESIZE)
#endif

/* LRU length threshold used by dcnode_alloc() and dcnode_free(). */
#define	DCLRUSIZE	16

/* Number of slots in dcbuf_cache[]. */
#define	DCCACHESIZE	4

/* Round x down to a multiple of y; only correct when y is a power of two. */
#define	rounddown(x, y)	((x) & ~((y) - 1))
122 struct dcnode *dctable[DCTABLESIZE];
124 struct dcnode *dclru;
125 static int dclru_len;
127 kmutex_t dctable_lock;
129 dev_t dcdev;
130 struct vfs dc_vfs;
132 struct kmem_cache *dcnode_cache;
133 struct kmem_cache *dcbuf_cache[DCCACHESIZE];
135 kmutex_t dccache_lock;
137 static int dcinit(int, char *);
139 static struct dcnode *dcnode_alloc(void);
140 static void dcnode_free(struct dcnode *);
141 static void dcnode_recycle(struct dcnode *);
143 static void dcinsert(struct dcnode *);
144 static void dcdelete(struct dcnode *);
145 static struct dcnode *dcfind(struct vnode *);
146 static void dclru_add(struct dcnode *);
147 static void dclru_sub(struct dcnode *);
151 * This is the loadable module wrapper.
153 #include <sys/modctl.h>
155 /* yes, we want all defaults */
156 static const struct vfsops dc_vfsops;
158 static vfsdef_t vfw = {
159 VFSDEF_VERSION,
160 "dcfs",
161 dcinit,
162 VSW_ZMOUNT,
163 NULL
167 * Module linkage information for the kernel.
169 extern struct mod_ops mod_fsops;
171 static struct modlfs modlfs = {
172 &mod_fsops, "compressed filesystem", &vfw
175 static struct modlinkage modlinkage = {
176 MODREV_1, (void *)&modlfs, NULL
180 _init()
182 return (mod_install(&modlinkage));
186 _info(struct modinfo *modinfop)
188 return (mod_info(&modlinkage, modinfop));
192 static int dc_open(struct vnode **, int, struct cred *, caller_context_t *);
193 static int dc_close(struct vnode *, int, int, offset_t,
194 struct cred *, caller_context_t *);
195 static int dc_read(struct vnode *, struct uio *, int, struct cred *,
196 struct caller_context *);
197 static int dc_getattr(struct vnode *, struct vattr *, int,
198 struct cred *, caller_context_t *);
199 static int dc_setattr(struct vnode *, struct vattr *, int, struct cred *,
200 struct caller_context *);
201 static int dc_access(struct vnode *, int, int,
202 struct cred *, caller_context_t *);
203 static int dc_fsync(struct vnode *, int, struct cred *, caller_context_t *);
204 static void dc_inactive(struct vnode *, struct cred *, caller_context_t *);
205 static int dc_fid(struct vnode *, struct fid *, caller_context_t *);
206 static int dc_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
207 static int dc_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
208 struct flk_callback *, struct cred *, caller_context_t *);
209 static int dc_realvp(struct vnode *, struct vnode **, caller_context_t *);
210 static int dc_getpage(struct vnode *, offset_t, size_t, uint_t *,
211 struct page **, size_t, struct seg *, caddr_t, enum seg_rw,
212 struct cred *, caller_context_t *);
213 static int dc_putpage(struct vnode *, offset_t, size_t, int,
214 struct cred *, caller_context_t *);
215 static int dc_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
216 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
217 static int dc_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
218 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
219 static int dc_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t,
220 uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
222 static const struct vnodeops dc_vnodeops = {
223 .vnop_name = "dcfs",
224 .vop_open = dc_open,
225 .vop_close = dc_close,
226 .vop_read = dc_read,
227 .vop_getattr = dc_getattr,
228 .vop_setattr = dc_setattr,
229 .vop_access = dc_access,
230 .vop_fsync = dc_fsync,
231 .vop_inactive = dc_inactive,
232 .vop_fid = dc_fid,
233 .vop_seek = dc_seek,
234 .vop_frlock = dc_frlock,
235 .vop_realvp = dc_realvp,
236 .vop_getpage = dc_getpage,
237 .vop_putpage = dc_putpage,
238 .vop_map = dc_map,
239 .vop_addmap = dc_addmap,
240 .vop_delmap = dc_delmap,
243 /*ARGSUSED*/
244 static int
245 dc_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ctp)
247 return (0);
250 /*ARGSUSED*/
251 static int
252 dc_close(struct vnode *vp, int flag, int count, offset_t off,
253 struct cred *cr, caller_context_t *ctp)
255 (void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
256 cleanshares(vp, ttoproc(curthread)->p_pid);
257 return (0);
260 /*ARGSUSED*/
261 static int
262 dc_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
263 struct caller_context *ct)
265 struct dcnode *dp = VTODC(vp);
266 size_t rdsize = MAX(MAXBSIZE, dp->dc_hdr->ch_blksize);
267 size_t fsize = dp->dc_hdr->ch_fsize;
268 int error;
271 * Loop through file with segmap, decompression will occur
272 * in dc_getapage
274 do {
275 caddr_t base;
276 size_t n;
277 offset_t mapon;
280 * read to end of block or file
282 mapon = uiop->uio_loffset & (rdsize - 1);
283 n = MIN(rdsize - mapon, uiop->uio_resid);
284 n = MIN(n, fsize - uiop->uio_loffset);
285 if (n == 0)
286 return (0); /* at EOF */
288 base = segmap_getmapflt(segkmap, vp, uiop->uio_loffset, n, 1,
289 S_READ);
290 error = uiomove(base + mapon, n, UIO_READ, uiop);
291 if (!error) {
292 uint_t flags;
294 if (n + mapon == rdsize || uiop->uio_loffset == fsize)
295 flags = SM_DONTNEED;
296 else
297 flags = 0;
298 error = segmap_release(segkmap, base, flags);
299 } else
300 (void) segmap_release(segkmap, base, 0);
301 } while (!error && uiop->uio_resid);
303 return (error);
306 static int
307 dc_getattr(struct vnode *vp, struct vattr *vap, int flags,
308 cred_t *cred, caller_context_t *ctp)
310 struct dcnode *dp = VTODC(vp);
311 struct vnode *subvp = dp->dc_subvp;
312 int error;
314 error = fop_getattr(subvp, vap, flags, cred, ctp);
316 /* substitute uncompressed size */
317 vap->va_size = dp->dc_hdr->ch_fsize;
318 return (error);
321 static int
322 dc_setattr(struct vnode *vp, struct vattr *vap, int flags, cred_t *cred,
323 caller_context_t *ctp)
325 struct dcnode *dp = VTODC(vp);
326 struct vnode *subvp = dp->dc_subvp;
328 return (fop_setattr(subvp, vap, flags, cred, ctp));
331 static int
332 dc_access(struct vnode *vp, int mode, int flags,
333 cred_t *cred, caller_context_t *ctp)
335 struct dcnode *dp = VTODC(vp);
336 struct vnode *subvp = dp->dc_subvp;
338 return (fop_access(subvp, mode, flags, cred, ctp));
341 /*ARGSUSED*/
342 static int
343 dc_fsync(vnode_t *vp, int syncflag, cred_t *cred, caller_context_t *ctp)
345 return (0);
348 /*ARGSUSED*/
349 static void
350 dc_inactive(struct vnode *vp, cred_t *cr, caller_context_t *ctp)
352 struct dcnode *dp = VTODC(vp);
354 mutex_enter(&dctable_lock);
355 mutex_enter(&vp->v_lock);
356 ASSERT(vp->v_count >= 1);
357 VN_RELE_LOCKED(vp);
358 if (vp->v_count != 0) {
360 * Somebody accessed the dcnode before we got a chance to
361 * remove it. They will remove it when they do a vn_rele.
363 mutex_exit(&vp->v_lock);
364 mutex_exit(&dctable_lock);
365 return;
367 mutex_exit(&vp->v_lock);
369 dcnode_free(dp);
371 mutex_exit(&dctable_lock);
374 static int
375 dc_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ctp)
377 struct dcnode *dp = VTODC(vp);
378 struct vnode *subvp = dp->dc_subvp;
380 return (fop_fid(subvp, fidp, ctp));
383 static int
384 dc_seek(struct vnode *vp, offset_t oof, offset_t *noffp, caller_context_t *ctp)
386 struct dcnode *dp = VTODC(vp);
387 struct vnode *subvp = dp->dc_subvp;
389 return (fop_seek(subvp, oof, noffp, ctp));
392 static int
393 dc_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
394 offset_t offset, struct flk_callback *flk_cbp,
395 cred_t *cr, caller_context_t *ctp)
397 struct dcnode *dp = VTODC(vp);
398 int error;
399 struct vattr vattr;
402 * If file is being mapped, disallow frlock.
404 vattr.va_mask = VATTR_MODE;
405 if (error = fop_getattr(dp->dc_subvp, &vattr, 0, cr, ctp))
406 return (error);
407 if (dp->dc_mapcnt > 0 && MANDLOCK(vp, vattr.va_mode))
408 return (EAGAIN);
410 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ctp));
413 /*ARGSUSED*/
414 static int
415 dc_getblock_miss(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
416 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
418 struct dcnode *dp = VTODC(vp);
419 struct comphdr *hdr = dp->dc_hdr;
420 struct page *pp;
421 struct buf *bp;
422 caddr_t saddr;
423 off_t cblkno;
424 size_t rdoff, rdsize, dsize;
425 long xlen;
426 int error, zerr;
428 ASSERT(len == hdr->ch_blksize);
430 * Get destination pages and make them addressable
432 pp = page_create_va(&vp->v_object, off, len, PG_WAIT, seg, addr);
433 bp = pageio_setup(pp, len, vp, B_READ);
434 bp_mapin(bp);
437 * read compressed data from subordinate vnode
439 saddr = kmem_cache_alloc(dp->dc_bufcache, KM_SLEEP);
440 cblkno = off / len;
441 rdoff = hdr->ch_blkmap[cblkno];
442 rdsize = hdr->ch_blkmap[cblkno + 1] - rdoff;
443 error = vn_rdwr(UIO_READ, dp->dc_subvp, saddr, rdsize, rdoff,
444 UIO_SYSSPACE, 0, 0, cr, NULL);
445 if (error)
446 goto cleanup;
449 * Uncompress
451 dsize = len;
452 zerr = z_uncompress(bp->b_un.b_addr, &dsize, saddr, dp->dc_zmax);
453 if (zerr != Z_OK) {
454 error = EIO;
455 goto cleanup;
459 * Handle EOF
461 xlen = hdr->ch_fsize - off;
462 if (xlen < len) {
463 bzero(bp->b_un.b_addr + xlen, len - xlen);
464 if (dsize != xlen)
465 error = EIO;
466 } else if (dsize != len)
467 error = EIO;
470 * Clean up
472 cleanup:
473 kmem_cache_free(dp->dc_bufcache, saddr);
474 pageio_done(bp);
475 *ppp = pp;
476 return (error);
479 static int
480 dc_getblock(struct vnode *vp, offset_t off, size_t len, struct page **ppp,
481 struct seg *seg, caddr_t addr, enum seg_rw rw, struct cred *cr)
483 struct page *pp, *plist = NULL;
484 offset_t pgoff;
485 int rdblk;
488 * pvn_read_kluster() doesn't quite do what we want, since it
489 * thinks sub block reads are ok. Here we always decompress
490 * a full block.
494 * Check page cache
496 rdblk = 0;
497 for (pgoff = off; pgoff < off + len; pgoff += PAGESIZE) {
498 pp = page_lookup(&vp->v_object, pgoff, SE_EXCL);
499 if (pp == NULL) {
500 rdblk = 1;
501 break;
503 page_io_lock(pp);
504 page_add(&plist, pp);
505 plist = plist->p_next;
507 if (!rdblk) {
508 *ppp = plist;
509 return (0); /* all pages in cache */
513 * Undo any locks so getblock_miss has an open field
515 if (plist != NULL)
516 pvn_io_done(plist);
518 return (dc_getblock_miss(vp, off, len, ppp, seg, addr, rw, cr));
521 static int
522 dc_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
524 struct vnode *rvp;
526 vp = VTODC(vp)->dc_subvp;
527 if (fop_realvp(vp, &rvp, ct) == 0)
528 vp = rvp;
529 *vpp = vp;
530 return (0);
533 /*ARGSUSED10*/
534 static int
535 dc_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
536 struct page *pl[], size_t plsz, struct seg *seg, caddr_t addr,
537 enum seg_rw rw, struct cred *cr, caller_context_t *ctp)
539 struct dcnode *dp = VTODC(vp);
540 struct comphdr *hdr = dp->dc_hdr;
541 struct page *pp, *plist = NULL;
542 caddr_t vp_baddr;
543 offset_t vp_boff, vp_bend;
544 size_t bsize = hdr->ch_blksize;
545 int nblks, error;
547 /* does not support write */
548 if (rw == S_WRITE) {
549 panic("write attempt on compressed file");
550 /*NOTREACHED*/
553 if (protp)
554 *protp = PROT_ALL;
556 * We don't support asynchronous operation at the moment, so
557 * just pretend we did it. If the pages are ever actually
558 * needed, they'll get brought in then.
560 if (pl == NULL)
561 return (0);
564 * Calc block start and end offsets
566 vp_boff = rounddown(off, bsize);
567 vp_bend = roundup(off + len, bsize);
568 vp_baddr = (caddr_t)rounddown((uintptr_t)addr, bsize);
570 nblks = (vp_bend - vp_boff) / bsize;
571 while (nblks--) {
572 error = dc_getblock(vp, vp_boff, bsize, &pp, seg, vp_baddr,
573 rw, cr);
574 page_list_concat(&plist, &pp);
575 vp_boff += bsize;
576 vp_baddr += bsize;
578 if (!error)
579 pvn_plist_init(plist, pl, plsz, off, len, rw);
580 else
581 pvn_read_done(plist, B_ERROR);
582 return (error);
586 * This function should never be called. We need to have it to pass
587 * it as an argument to other functions.
589 /*ARGSUSED*/
590 static int
591 dc_putapage(struct vnode *vp, struct page *pp, uoff_t *offp, size_t *lenp,
592 int flags, struct cred *cr)
594 /* should never happen */
595 cmn_err(CE_PANIC, "dcfs: dc_putapage: dirty page");
596 /*NOTREACHED*/
597 return (0);
602 * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
603 * B_INVAL is set by:
605 * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
606 * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
607 * which translates to an MC_SYNC with the MS_INVALIDATE flag.
609 * The B_FREE (as well as the B_DONTNEED) flag is set when the
610 * MADV_SEQUENTIAL advice has been used. fop_putpage is invoked
611 * from SEGVN to release pages behind a pagefault.
613 /*ARGSUSED5*/
614 static int
615 dc_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
616 struct cred *cr, caller_context_t *ctp)
618 int error = 0;
620 if (vp->v_count == 0) {
621 panic("dcfs_putpage: bad v_count");
622 /*NOTREACHED*/
625 if (vp->v_flag & VNOMAP)
626 return (ENOSYS);
628 if (!vn_has_cached_data(vp)) /* no pages mapped */
629 return (0);
631 if (len == 0) /* from 'off' to EOF */
632 error = pvn_vplist_dirty(vp, off, dc_putapage, flags, cr);
633 else {
634 offset_t io_off;
635 se_t se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
637 for (io_off = off; io_off < off + len; io_off += PAGESIZE) {
638 page_t *pp;
641 * We insist on getting the page only if we are
642 * about to invalidate, free or write it and
643 * the B_ASYNC flag is not set.
645 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0))
646 pp = page_lookup(&vp->v_object, io_off, se);
647 else
648 pp = page_lookup_nowait(&vp->v_object,
649 io_off, se);
651 if (pp == NULL)
652 continue;
654 * Normally pvn_getdirty() should return 0, which
655 * impies that it has done the job for us.
656 * The shouldn't-happen scenario is when it returns 1.
657 * This means that the page has been modified and
658 * needs to be put back.
659 * Since we can't write to a dcfs compressed file,
660 * we fake a failed I/O and force pvn_write_done()
661 * to destroy the page.
663 if (pvn_getdirty(pp, flags) == 1) {
664 cmn_err(CE_NOTE, "dc_putpage: dirty page");
665 pvn_write_done(pp, flags |
666 B_ERROR | B_WRITE | B_INVAL | B_FORCE);
670 return (error);
673 static int
674 dc_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
675 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
676 struct cred *cred, caller_context_t *ctp)
678 struct vattr vattr;
679 struct segvn_crargs vn_a;
680 int error;
682 if (vp->v_flag & VNOMAP)
683 return (ENOSYS);
685 if (off < 0 || (offset_t)(off + len) < 0)
686 return (ENXIO);
689 * If file is being locked, disallow mapping.
691 if (error = fop_getattr(VTODC(vp)->dc_subvp, &vattr, 0, cred, ctp))
692 return (error);
693 if (vn_has_mandatory_locks(vp, vattr.va_mode))
694 return (EAGAIN);
696 as_rangelock(as);
698 if ((flags & MAP_FIXED) == 0) {
699 map_addr(addrp, len, off, 1, flags);
700 if (*addrp == NULL) {
701 as_rangeunlock(as);
702 return (ENOMEM);
704 } else {
706 * User specified address - blow away any previous mappings
708 (void) as_unmap(as, *addrp, len);
711 vn_a.vp = vp;
712 vn_a.offset = off;
713 vn_a.type = flags & MAP_TYPE;
714 vn_a.prot = prot;
715 vn_a.maxprot = maxprot;
716 vn_a.flags = flags & ~MAP_TYPE;
717 vn_a.cred = cred;
718 vn_a.amp = NULL;
719 vn_a.szc = 0;
720 vn_a.lgrp_mem_policy_flags = 0;
722 error = as_map(as, *addrp, len, segvn_create, &vn_a);
723 as_rangeunlock(as);
724 return (error);
727 /*ARGSUSED*/
728 static int
729 dc_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
730 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
731 struct cred *cr, caller_context_t *ctp)
733 struct dcnode *dp;
735 if (vp->v_flag & VNOMAP)
736 return (ENOSYS);
738 dp = VTODC(vp);
739 mutex_enter(&dp->dc_lock);
740 dp->dc_mapcnt += btopr(len);
741 mutex_exit(&dp->dc_lock);
742 return (0);
745 /*ARGSUSED*/
746 static int
747 dc_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
748 size_t len, uint_t prot, uint_t maxprot, uint_t flags,
749 struct cred *cr, caller_context_t *ctp)
751 struct dcnode *dp;
753 if (vp->v_flag & VNOMAP)
754 return (ENOSYS);
756 dp = VTODC(vp);
757 mutex_enter(&dp->dc_lock);
758 dp->dc_mapcnt -= btopr(len);
759 ASSERT(dp->dc_mapcnt >= 0);
760 mutex_exit(&dp->dc_lock);
761 return (0);
765 * Constructor/destructor routines for dcnodes
767 /*ARGSUSED1*/
768 static int
769 dcnode_constructor(void *buf, void *cdrarg, int kmflags)
771 struct dcnode *dp = buf;
772 struct vnode *vp;
774 vp = dp->dc_vp = vn_alloc(kmflags);
775 if (vp == NULL) {
776 return (-1);
778 vp->v_data = dp;
779 vp->v_type = VREG;
780 vp->v_flag = VNOSWAP;
781 vp->v_vfsp = &dc_vfs;
782 vn_setops(vp, &dc_vnodeops);
783 vn_exists(vp);
785 mutex_init(&dp->dc_lock, NULL, MUTEX_DEFAULT, NULL);
786 dp->dc_mapcnt = 0;
787 dp->dc_lrunext = dp->dc_lruprev = NULL;
788 dp->dc_hdr = NULL;
789 dp->dc_subvp = NULL;
790 return (0);
793 /*ARGSUSED*/
794 static void
795 dcnode_destructor(void *buf, void *cdrarg)
797 struct dcnode *dp = buf;
798 struct vnode *vp = DCTOV(dp);
800 mutex_destroy(&dp->dc_lock);
802 VERIFY(dp->dc_hdr == NULL);
803 VERIFY(dp->dc_subvp == NULL);
804 vn_invalid(vp);
805 vn_free(vp);
808 static struct dcnode *
809 dcnode_alloc(void)
811 struct dcnode *dp;
814 * If the free list is above DCLRUSIZE
815 * re-use one from it
817 mutex_enter(&dctable_lock);
818 if (dclru_len < DCLRUSIZE) {
819 mutex_exit(&dctable_lock);
820 dp = kmem_cache_alloc(dcnode_cache, KM_SLEEP);
821 } else {
822 ASSERT(dclru != NULL);
823 dp = dclru;
824 dclru_sub(dp);
825 dcdelete(dp);
826 mutex_exit(&dctable_lock);
827 dcnode_recycle(dp);
829 return (dp);
832 static void
833 dcnode_free(struct dcnode *dp)
835 struct vnode *vp = DCTOV(dp);
837 ASSERT(MUTEX_HELD(&dctable_lock));
840 * If no cached pages, no need to put it on lru
842 if (!vn_has_cached_data(vp)) {
843 dcdelete(dp);
844 dcnode_recycle(dp);
845 kmem_cache_free(dcnode_cache, dp);
846 return;
850 * Add to lru, if it's over the limit, free from head
852 dclru_add(dp);
853 if (dclru_len > DCLRUSIZE) {
854 dp = dclru;
855 dclru_sub(dp);
856 dcdelete(dp);
857 dcnode_recycle(dp);
858 kmem_cache_free(dcnode_cache, dp);
862 static void
863 dcnode_recycle(struct dcnode *dp)
865 struct vnode *vp;
867 vp = DCTOV(dp);
869 VN_RELE(dp->dc_subvp);
870 dp->dc_subvp = NULL;
871 (void) pvn_vplist_dirty(vp, 0, dc_putapage, B_INVAL, NULL);
872 kmem_free(dp->dc_hdr, dp->dc_hdrsize);
873 dp->dc_hdr = NULL;
874 dp->dc_hdrsize = dp->dc_zmax = 0;
875 dp->dc_bufcache = NULL;
876 dp->dc_mapcnt = 0;
877 vn_reinit(vp);
878 vp->v_type = VREG;
879 vp->v_flag = VNOSWAP;
880 vp->v_vfsp = &dc_vfs;
883 static int
884 dcinit(int fstype, char *name)
886 int error;
887 major_t dev;
889 error = vfs_setfsops(fstype, &dc_vfsops);
890 if (error) {
891 cmn_err(CE_WARN, "dcinit: bad fstype");
892 return (error);
894 VFS_INIT(&dc_vfs, &dc_vfsops, NULL);
895 dc_vfs.vfs_flag = VFS_RDONLY;
896 dc_vfs.vfs_fstype = fstype;
897 if ((dev = getudev()) == (major_t)-1)
898 dev = 0;
899 dcdev = makedevice(dev, 0);
900 dc_vfs.vfs_dev = dcdev;
902 mutex_init(&dctable_lock, NULL, MUTEX_DEFAULT, NULL);
903 mutex_init(&dccache_lock, NULL, MUTEX_DEFAULT, NULL);
904 dcnode_cache = kmem_cache_create("dcnode_cache", sizeof (struct dcnode),
905 0, dcnode_constructor, dcnode_destructor, NULL, NULL, NULL, 0);
907 return (0);
911 * Return shadow vnode with the given vp as its subordinate
913 struct vnode *
914 decompvp(struct vnode *vp, cred_t *cred, caller_context_t *ctp)
916 struct dcnode *dp, *ndp;
917 struct comphdr thdr, *hdr;
918 struct kmem_cache **cpp;
919 struct vattr vattr;
920 size_t hdrsize, bsize;
921 int error;
924 * See if we have an existing shadow
925 * If none, we have to manufacture one
927 mutex_enter(&dctable_lock);
928 dp = dcfind(vp);
929 mutex_exit(&dctable_lock);
930 if (dp != NULL)
931 return (DCTOV(dp));
934 * Make sure it's a valid compressed file
936 hdr = &thdr;
937 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, sizeof (struct comphdr), 0,
938 UIO_SYSSPACE, 0, 0, cred, NULL);
939 if (error || hdr->ch_magic != CH_MAGIC_ZLIB ||
940 hdr->ch_version != CH_VERSION || hdr->ch_algorithm != CH_ALG_ZLIB ||
941 hdr->ch_fsize == 0 || hdr->ch_blksize < PAGESIZE ||
942 hdr->ch_blksize > ptob(DCCACHESIZE) || !ISP2(hdr->ch_blksize))
943 return (NULL);
945 /* get underlying file size */
946 if (fop_getattr(vp, &vattr, 0, cred, ctp) != 0)
947 return (NULL);
950 * Re-read entire header
952 hdrsize = hdr->ch_blkmap[0] + sizeof (uint64_t);
953 hdr = kmem_alloc(hdrsize, KM_SLEEP);
954 error = vn_rdwr(UIO_READ, vp, (caddr_t)hdr, hdrsize, 0, UIO_SYSSPACE,
955 0, 0, cred, NULL);
956 if (error) {
957 kmem_free(hdr, hdrsize);
958 return (NULL);
962 * add extra blkmap entry to make dc_getblock()'s
963 * life easier
965 bsize = hdr->ch_blksize;
966 hdr->ch_blkmap[((hdr->ch_fsize-1) / bsize) + 1] = vattr.va_size;
968 ndp = dcnode_alloc();
969 ndp->dc_subvp = vp;
970 VN_HOLD(vp);
971 ndp->dc_hdr = hdr;
972 ndp->dc_hdrsize = hdrsize;
975 * Allocate kmem cache if none there already
977 ndp->dc_zmax = ZMAXBUF(bsize);
978 cpp = &dcbuf_cache[btop(bsize)];
979 mutex_enter(&dccache_lock);
980 if (*cpp == NULL)
981 *cpp = kmem_cache_create("dcbuf_cache", ndp->dc_zmax, 0, NULL,
982 NULL, NULL, NULL, NULL, 0);
983 mutex_exit(&dccache_lock);
984 ndp->dc_bufcache = *cpp;
987 * Recheck table in case someone else created shadow
988 * while we were blocked above.
990 mutex_enter(&dctable_lock);
991 dp = dcfind(vp);
992 if (dp != NULL) {
993 mutex_exit(&dctable_lock);
994 dcnode_recycle(ndp);
995 kmem_cache_free(dcnode_cache, ndp);
996 return (DCTOV(dp));
998 dcinsert(ndp);
999 mutex_exit(&dctable_lock);
1001 return (DCTOV(ndp));
1006 * dcnode lookup table
1007 * These routines maintain a table of dcnodes hashed by their
1008 * subordinate vnode so that they can be found if they already
1009 * exist in the vnode cache
1013 * Put a dcnode in the table.
1015 static void
1016 dcinsert(struct dcnode *newdp)
1018 int idx = DCHASH(newdp->dc_subvp);
1020 ASSERT(MUTEX_HELD(&dctable_lock));
1021 newdp->dc_hash = dctable[idx];
1022 dctable[idx] = newdp;
1026 * Remove a dcnode from the hash table.
1028 void
1029 dcdelete(struct dcnode *deldp)
1031 int idx = DCHASH(deldp->dc_subvp);
1032 struct dcnode *dp, *prevdp;
1034 ASSERT(MUTEX_HELD(&dctable_lock));
1035 dp = dctable[idx];
1036 if (dp == deldp)
1037 dctable[idx] = dp->dc_hash;
1038 else {
1039 for (prevdp = dp, dp = dp->dc_hash; dp != NULL;
1040 prevdp = dp, dp = dp->dc_hash) {
1041 if (dp == deldp) {
1042 prevdp->dc_hash = dp->dc_hash;
1043 break;
1047 ASSERT(dp != NULL);
1051 * Find a shadow vnode in the dctable hash list.
1053 static struct dcnode *
1054 dcfind(struct vnode *vp)
1056 struct dcnode *dp;
1058 ASSERT(MUTEX_HELD(&dctable_lock));
1059 for (dp = dctable[DCHASH(vp)]; dp != NULL; dp = dp->dc_hash)
1060 if (dp->dc_subvp == vp) {
1061 VN_HOLD(DCTOV(dp));
1062 if (dp->dc_lrunext)
1063 dclru_sub(dp);
1064 return (dp);
1066 return (NULL);
#ifdef DEBUG
/*
 * dclru_count - count the entries on the circular LRU list by walking it.
 * Used by the assertions in dclru_add() and dclru_sub().
 */
static int
dclru_count(void)
{
	struct dcnode *cur = dclru;
	int cnt = 0;

	if (cur == NULL)
		return (0);

	/* go once around the ring, back to the head */
	do {
		cnt++;
		cur = cur->dc_lrunext;
	} while (cur != dclru);

	return (cnt);
}
#endif
1084 static void
1085 dclru_add(struct dcnode *dp)
1088 * Add to dclru as double-link chain
1090 ASSERT(MUTEX_HELD(&dctable_lock));
1091 if (dclru == NULL) {
1092 dclru = dp;
1093 dp->dc_lruprev = dp->dc_lrunext = dp;
1094 } else {
1095 struct dcnode *last = dclru->dc_lruprev;
1097 dclru->dc_lruprev = dp;
1098 last->dc_lrunext = dp;
1099 dp->dc_lruprev = last;
1100 dp->dc_lrunext = dclru;
1102 dclru_len++;
1103 ASSERT(dclru_len == dclru_count());
1106 static void
1107 dclru_sub(struct dcnode *dp)
1109 ASSERT(MUTEX_HELD(&dctable_lock));
1110 dp->dc_lrunext->dc_lruprev = dp->dc_lruprev;
1111 dp->dc_lruprev->dc_lrunext = dp->dc_lrunext;
1112 if (dp == dclru)
1113 dclru = dp->dc_lrunext == dp ? NULL : dp->dc_lrunext;
1114 dp->dc_lrunext = dp->dc_lruprev = NULL;
1115 dclru_len--;
1116 ASSERT(dclru_len == dclru_count());