kernel/fs/gfs.c (unleashed.git)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/* Portions Copyright 2007 Shivakumar GN */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/dirent.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/sunddi.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/vfs.h>
#include <sys/vnode.h>

#include <vm/as.h>
#include <vm/seg_vn.h>

#include <sys/gfs.h>
/*
 * Generic pseudo-filesystem routines.
 *
 * There are significant similarities between the implementation of certain file
 * system entry points across different filesystems. While one could attempt to
 * "choke up on the bat" and incorporate common functionality into a VOP
 * preamble or postamble, such an approach is limited in the benefit it can
 * provide. In this file we instead define a toolkit of routines which can be
 * called from a filesystem (with in-kernel pseudo-filesystems being the focus
 * of the exercise) in a more component-like fashion.
 *
 * There are three basic classes of routines:
 *
 * 1) Low-level support routines
 *
 *    These routines are designed to play a support role for existing
 *    pseudo-filesystems (such as procfs). They simplify common tasks,
 *    without forcing the filesystem to hand over management to GFS. The
 *    routines covered are:
 *
 *	gfs_readdir_init()
 *	gfs_readdir_emit()
 *	gfs_readdir_emitn()
 *	gfs_readdir_pred()
 *	gfs_readdir_fini()
 *	gfs_lookup_dot()
 *
 * 2) Complete GFS management
 *
 *    These routines take a more active role in management of the
 *    pseudo-filesystem. They handle the relationship between vnode private
 *    data and VFS data, as well as the relationship between vnodes in the
 *    directory hierarchy.
 *
 *    In order to use these interfaces, the first member of every private
 *    v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control
 *    to GFS.
 *
 *	gfs_file_create()
 *	gfs_dir_create()
 *	gfs_root_create()
 *
 *	gfs_file_inactive()
 *	gfs_dir_inactive()
 *	gfs_dir_lookup()
 *	gfs_dir_readdir()
 *
 *	gfs_vop_inactive()
 *	gfs_vop_lookup()
 *	gfs_vop_readdir()
 *	gfs_vop_map()
 *
 * 3) Single File pseudo-filesystems
 *
 *    This routine creates a rooted file to be overlaid on top of another
 *    file in the physical filespace.
 *
 *    Note that the parent is NULL (actually the vfs), but there is nothing
 *    technically keeping such a file from utilizing the "Complete GFS
 *    management" set of routines.
 *
 *	gfs_root_create_file()
 */
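/*
 * A minimal sketch of how a consumer might wire the "Complete GFS management"
 * routines together. Every mypfs_* name below is hypothetical and is shown
 * only to illustrate the call pattern; it is not part of this file.
 *
 *	typedef struct mypfs_node {
 *		gfs_dir_t	mn_dir;		(must be the first member)
 *		int		mn_private;
 *	} mypfs_node_t;
 *
 *	static gfs_dirent_t mypfs_entries[] = {
 *		{ "status", mypfs_status_ctor, GFS_CACHE_VNODE },
 *		{ NULL }
 *	};
 *
 * At mount time:
 *
 *	rootvp = gfs_root_create(sizeof (mypfs_node_t), vfsp, mypfs_dir_ops,
 *	    MYPFS_ROOT_INO, mypfs_entries, mypfs_inode_cb, MAXNAMELEN,
 *	    NULL, NULL);
 *
 * The vnode ops table can then point its lookup, readdir and inactive
 * entries directly at gfs_vop_lookup(), gfs_vop_readdir() and
 * gfs_vop_inactive().
 */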
/*
 * Low level directory routines
 *
 * These routines provide some simple abstractions for reading directories.
 * They are designed to be used by existing pseudo filesystems (namely procfs)
 * that already have a complicated management infrastructure.
 */

/*
 * gfs_get_parent_ino: used to obtain a parent inode number and the
 * inode number of the given vnode in preparation for calling gfs_readdir_init.
 */
int
gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
    ino64_t *pino, ino64_t *ino)
{
        vnode_t *parent;
        gfs_dir_t *dp = dvp->v_data;
        int error;

        *ino = dp->gfsd_file.gfs_ino;
        parent = dp->gfsd_file.gfs_parent;

        if (parent == NULL) {
                *pino = *ino;           /* root of filesystem */
        } else if (dvp->v_flag & V_XATTRDIR) {
                vattr_t va;

                va.va_mask = AT_NODEID;
                error = fop_getattr(parent, &va, 0, cr, ct);
                if (error)
                        return (error);
                *pino = va.va_nodeid;
        } else {
                *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
        }

        return (0);
}
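/*
 * A short sketch of the intended pairing, from a hypothetical fop_readdir
 * entry point (gfs_dir_readdir() below is the real, complete version):
 *
 *	ino64_t ino, pino;
 *
 *	if ((error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino)) != 0)
 *		return (error);
 *	error = gfs_readdir_init(&gstate, MAXNAMELEN, 1, uiop, pino, ino,
 *	    flags);
 */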
/*
 * gfs_readdir_init: initiate a generic readdir
 *   st       - a pointer to an uninitialized gfs_readdir_state_t structure
 *   name_max - the directory's maximum file name length
 *   ureclen  - the exported file-space record length (1 for non-legacy FSs)
 *   uiop     - the uiop passed to readdir
 *   parent   - the parent directory's inode
 *   self     - this directory's inode
 *   flags    - flags from fop_readdir
 *
 * Returns 0 or a non-zero errno.
 *
 * Typical fop_readdir usage of gfs_readdir_*:
 *
 *	if ((error = gfs_readdir_init(...)) != 0)
 *		return (error);
 *	eof = 0;
 *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *		if (!consumer_entry_at(voffset))
 *			voffset = consumer_next_entry(voffset);
 *		if (consumer_eof(voffset)) {
 *			eof = 1;
 *			break;
 *		}
 *		if ((error = gfs_readdir_emit(..., voffset,
 *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
 *			break;
 *	}
 *	return (gfs_readdir_fini(..., error, eofp, eof));
 *
 * As you can see, a zero result from gfs_readdir_pred() or
 * gfs_readdir_emit() indicates that processing should continue,
 * whereas a non-zero result indicates that the loop should terminate.
 * Most consumers need do nothing more than let gfs_readdir_fini()
 * determine what the cause of failure was and return the appropriate
 * value.
 */
int
gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    uio_t *uiop, ino64_t parent, ino64_t self, int flags)
{
        size_t dirent_size;

        if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
            (uiop->uio_loffset % ureclen) != 0)
                return (EINVAL);

        st->grd_ureclen = ureclen;
        st->grd_oresid = uiop->uio_resid;
        st->grd_namlen = name_max;
        if (flags & V_RDDIR_ENTFLAGS)
                dirent_size = EDIRENT_RECLEN(st->grd_namlen);
        else
                dirent_size = DIRENT64_RECLEN(st->grd_namlen);
        st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
        st->grd_parent = parent;
        st->grd_self = self;
        st->grd_flags = flags;

        return (0);
}
/*
 * gfs_readdir_emit_int: internal routine to emit directory entry
 *
 *   st   - the current readdir state, which must have d_ino/ed_ino
 *          and d_name/ed_name set
 *   uiop - caller-supplied uio pointer
 *   next - the offset of the next entry
 */
static int
gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
{
        int reclen;
        dirent64_t *dp;
        edirent_t *edp;

        if (st->grd_flags & V_RDDIR_ENTFLAGS) {
                edp = st->grd_dirent;
                reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
        } else {
                dp = st->grd_dirent;
                reclen = DIRENT64_RECLEN(strlen(dp->d_name));
        }

        if (reclen > uiop->uio_resid) {
                /*
                 * Error if no entries were returned yet
                 */
                if (uiop->uio_resid == st->grd_oresid)
                        return (EINVAL);
                return (-1);
        }

        if (st->grd_flags & V_RDDIR_ENTFLAGS) {
                edp->ed_off = next;
                edp->ed_reclen = (ushort_t)reclen;
        } else {
                dp->d_off = next;
                dp->d_reclen = (ushort_t)reclen;
        }

        if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
                return (EFAULT);

        uiop->uio_loffset = next;

        return (0);
}
/*
 * gfs_readdir_emit: emit a directory entry
 *   voff   - the virtual offset (obtained from gfs_readdir_pred)
 *   ino    - the entry's inode
 *   name   - the entry's name
 *   eflags - value for ed_eflags (if processing edirent_t)
 *
 * Returns a 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate. A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, const char *name, int eflags)
{
        offset_t off = (voff + 2) * st->grd_ureclen;

        if (st->grd_flags & V_RDDIR_ENTFLAGS) {
                edirent_t *edp = st->grd_dirent;

                edp->ed_ino = ino;
                (void) strncpy(edp->ed_name, name, st->grd_namlen);
                edp->ed_eflags = eflags;
        } else {
                dirent64_t *dp = st->grd_dirent;

                dp->d_ino = ino;
                (void) strncpy(dp->d_name, name, st->grd_namlen);
        }

        /*
         * Inter-entry offsets are invalid, so we assume a record size of
         * grd_ureclen and explicitly set the offset appropriately.
         */
        return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
}

/*
 * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
 * instead of a string for the entry's name.
 */
int
gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    ino64_t ino, unsigned long num)
{
        char buf[40];

        numtos(num, buf);
        return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
}
/*
 * gfs_readdir_pred: readdir loop predicate
 *   voffp - a pointer in which the next virtual offset should be stored
 *
 * Returns a 0 on success, a non-zero errno on failure, or -1 if the
 * readdir loop should terminate. A non-zero result (either errno or
 * -1) from this function is typically passed directly to
 * gfs_readdir_fini().
 */
int
gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
{
        offset_t off, voff;
        int error;

top:
        if (uiop->uio_resid <= 0)
                return (-1);

        off = uiop->uio_loffset / st->grd_ureclen;
        voff = off - 2;
        if (off == 0) {
                if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
                    ".", 0)) == 0)
                        goto top;
        } else if (off == 1) {
                if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
                    "..", 0)) == 0)
                        goto top;
        } else {
                *voffp = voff;
                return (0);
        }

        return (error);
}

/*
 * gfs_readdir_fini: generic readdir cleanup
 *   error - if positive, an error to return
 *   eofp  - the eofp passed to readdir
 *   eof   - the eof value
 *
 * Returns a 0 on success, a non-zero errno on failure. This result
 * should be returned from readdir.
 */
int
gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
{
        size_t dirent_size;

        if (st->grd_flags & V_RDDIR_ENTFLAGS)
                dirent_size = EDIRENT_RECLEN(st->grd_namlen);
        else
                dirent_size = DIRENT64_RECLEN(st->grd_namlen);
        kmem_free(st->grd_dirent, dirent_size);
        if (error > 0)
                return (error);
        if (eofp)
                *eofp = eof;
        return (0);
}
/*
 * gfs_lookup_dot
 *
 * Performs a basic check for "." and ".." directory entries.
 */
int
gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
{
        if (*nm == '\0' || strcmp(nm, ".") == 0) {
                VN_HOLD(dvp);
                *vpp = dvp;
                return (0);
        } else if (strcmp(nm, "..") == 0) {
                if (pvp == NULL) {
                        ASSERT(dvp->v_flag & VROOT);
                        VN_HOLD(dvp);
                        *vpp = dvp;
                } else {
                        VN_HOLD(pvp);
                        *vpp = pvp;
                }
                return (0);
        }

        return (-1);
}
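/*
 * Illustrative only: a consumer that manages its own namespace (and therefore
 * does not go through gfs_dir_lookup()) would typically call gfs_lookup_dot()
 * at the top of its lookup entry point. The mypfs_* names are hypothetical:
 *
 *	static int
 *	mypfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, ...)
 *	{
 *		if (gfs_lookup_dot(vpp, dvp, mypfs_parent_of(dvp), nm) == 0)
 *			return (0);
 *		...handle the remaining names...
 *	}
 */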
/*
 * gfs_file_create(): create a new GFS file
 *
 *   size - size of private data structure (v_data)
 *   pvp  - parent vnode (GFS directory)
 *   ops  - vnode operations vector
 *
 * In order to use this interface, the parent vnode must have been created by
 * gfs_dir_create(), and the private data stored in v_data must have a
 * 'gfs_file_t' as its first field.
 *
 * Given these constraints, this routine will automatically:
 *
 *	- Allocate v_data for the vnode
 *	- Initialize necessary fields in the vnode
 *	- Hold the parent
 */
struct vnode *
gfs_file_create(size_t size, struct vnode *pvp, const struct vnodeops *ops)
{
        gfs_file_t *fp;
        struct vnode *vp;

        /*
         * Allocate vnode and internal data structure
         */
        fp = kmem_zalloc(size, KM_SLEEP);
        vp = vn_alloc(KM_SLEEP);

        /*
         * Set up various pointers
         */
        fp->gfs_vnode = vp;
        fp->gfs_parent = pvp;
        vp->v_data = fp;
        fp->gfs_size = size;
        fp->gfs_type = GFS_FILE;

        /*
         * Initialize vnode and hold parent.
         */
        vn_setops(vp, ops);
        if (pvp) {
                VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
                VN_HOLD(pvp);
        }

        return (vp);
}
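/*
 * A minimal sketch of the layout constraint described above. The mypfs_*
 * names are hypothetical; the only requirement is that gfs_file_t (or
 * gfs_dir_t for a directory) is the first member of the private structure:
 *
 *	typedef struct mypfs_file {
 *		gfs_file_t	mf_gfs;
 *		void		*mf_private;
 *	} mypfs_file_t;
 *
 *	vp = gfs_file_create(sizeof (mypfs_file_t), dvp, mypfs_file_ops);
 *	((mypfs_file_t *)vp->v_data)->mf_private = my_state;
 */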
/*
 * gfs_dir_create: creates a new directory in the parent
 *
 *   size       - size of private data structure (v_data)
 *   pvp        - parent vnode (GFS directory)
 *   ops        - vnode operations vector
 *   entries    - NULL-terminated list of static entries (if any)
 *   maxlen     - maximum length of a directory entry
 *   readdir_cb - readdir callback (see gfs_dir_readdir)
 *   inode_cb   - inode callback (see gfs_dir_readdir)
 *   lookup_cb  - lookup callback (see gfs_dir_lookup)
 *
 * In order to use this function, the first member of the private vnode
 * structure (v_data) must be a gfs_dir_t. For each directory, there are
 * static entries, defined when the structure is initialized, and dynamic
 * entries, retrieved through callbacks.
 *
 * If a directory has static entries, then it must supply an inode callback,
 * which will compute the inode number based on the parent and the index.
 * For a directory with dynamic entries, the caller must supply a readdir
 * callback and a lookup callback. If a static lookup fails, we fall back to
 * the supplied lookup callback, if any.
 *
 * This function also performs the same initialization as gfs_file_create().
 */
struct vnode *
gfs_dir_create(size_t struct_size, struct vnode *pvp,
    const struct vnodeops *ops,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
        vnode_t *vp;
        gfs_dir_t *dp;
        gfs_dirent_t *de;

        vp = gfs_file_create(struct_size, pvp, ops);
        vp->v_type = VDIR;

        dp = vp->v_data;
        dp->gfsd_file.gfs_type = GFS_DIR;
        dp->gfsd_maxlen = maxlen;

        if (entries != NULL) {
                for (de = entries; de->gfse_name != NULL; de++)
                        dp->gfsd_nstatic++;

                dp->gfsd_static = kmem_alloc(
                    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
                bcopy(entries, dp->gfsd_static,
                    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
        }

        dp->gfsd_readdir = readdir_cb;
        dp->gfsd_lookup = lookup_cb;
        dp->gfsd_inode = inode_cb;

        mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);

        return (vp);
}
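/*
 * A minimal sketch of creating a directory with both static and dynamic
 * entries. All mypfs_* names are hypothetical consumer code, shown only to
 * illustrate how the arguments fit together:
 *
 *	static gfs_dirent_t mypfs_static_entries[] = {
 *		{ "info",  mypfs_info_ctor,  GFS_CACHE_VNODE },
 *		{ "stats", mypfs_stats_ctor, 0 },
 *		{ NULL }
 *	};
 *
 *	dvp = gfs_dir_create(sizeof (mypfs_dir_t), parent_vp, mypfs_dir_ops,
 *	    mypfs_static_entries, mypfs_inode_cb, MAXNAMELEN,
 *	    mypfs_readdir_cb, mypfs_lookup_cb);
 *
 * A directory with only static entries may pass NULL for readdir_cb and
 * lookup_cb; a directory with only dynamic entries may pass NULL for
 * entries and inode_cb.
 */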
/*
 * gfs_root_create(): create a root vnode for a GFS filesystem
 *
 * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The
 * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
 */
struct vnode *
gfs_root_create(size_t size, struct vfs *vfsp,
    const struct vnodeops *ops, ino64_t ino,
    gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
{
        struct vnode *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
            maxlen, readdir_cb, lookup_cb);

        /* Manually set the inode */
        ((gfs_file_t *)vp->v_data)->gfs_ino = ino;

        VFS_HOLD(vfsp);
        VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
        vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

        return (vp);
}

/*
 * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
 *
 * Similar to gfs_root_create(), this creates a root vnode for a file to
 * be the pseudo-filesystem.
 */
struct vnode *
gfs_root_create_file(size_t size, struct vfs *vfsp,
    const struct vnodeops *ops, ino64_t ino)
{
        struct vnode *vp = gfs_file_create(size, NULL, ops);

        ((gfs_file_t *)vp->v_data)->gfs_ino = ino;

        VFS_HOLD(vfsp);
        VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
        vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;

        return (vp);
}
/*
 * gfs_file_inactive()
 *
 * Called from the fop_inactive() routine. If necessary, this routine will
 * remove the given vnode from the parent directory and clean up any references
 * in the VFS layer.
 *
 * If the vnode was not removed (due to a race with vget), then NULL is
 * returned. Otherwise, a pointer to the private data is returned.
 */
void *
gfs_file_inactive(vnode_t *vp)
{
        int i;
        gfs_dirent_t *ge = NULL;
        gfs_file_t *fp = vp->v_data;
        gfs_dir_t *dp = NULL;
        void *data;

        if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
                goto found;

        dp = fp->gfs_parent->v_data;

        /*
         * First, see if this vnode is cached in the parent.
         */
        gfs_dir_lock(dp);

        /*
         * Find it in the set of static entries.
         */
        for (i = 0; i < dp->gfsd_nstatic; i++) {
                ge = &dp->gfsd_static[i];

                if (ge->gfse_vnode == vp)
                        goto found;
        }

        /*
         * If 'ge' is NULL, then it is a dynamic entry.
         */
        ge = NULL;

found:
        if (vp->v_flag & V_XATTRDIR) {
                mutex_enter(&fp->gfs_parent->v_lock);
        }
        mutex_enter(&vp->v_lock);
        if (vp->v_count == 1) {
                /*
                 * Really remove this vnode
                 */
                data = vp->v_data;
                if (ge != NULL) {
                        /*
                         * If this was a statically cached entry, simply set the
                         * cached vnode to NULL.
                         */
                        ge->gfse_vnode = NULL;
                }
                if (vp->v_flag & V_XATTRDIR) {
                        fp->gfs_parent->v_xattrdir = NULL;
                        mutex_exit(&fp->gfs_parent->v_lock);
                }
                mutex_exit(&vp->v_lock);

                /*
                 * Free vnode and release parent
                 */
                if (fp->gfs_parent) {
                        if (dp) {
                                gfs_dir_unlock(dp);
                        }
                        VN_RELE(fp->gfs_parent);
                } else {
                        ASSERT(vp->v_vfsp != NULL);
                        VFS_RELE(vp->v_vfsp);
                }
                vn_free(vp);
        } else {
                VN_RELE_LOCKED(vp);
                data = NULL;
                mutex_exit(&vp->v_lock);
                if (vp->v_flag & V_XATTRDIR) {
                        mutex_exit(&fp->gfs_parent->v_lock);
                }
                if (dp)
                        gfs_dir_unlock(dp);
        }

        return (data);
}

/*
 * gfs_dir_inactive()
 *
 * Same as above, but for directories.
 */
void *
gfs_dir_inactive(vnode_t *vp)
{
        gfs_dir_t *dp;

        ASSERT(vp->v_type == VDIR);

        if ((dp = gfs_file_inactive(vp)) != NULL) {
                mutex_destroy(&dp->gfsd_lock);
                if (dp->gfsd_nstatic)
                        kmem_free(dp->gfsd_static,
                            dp->gfsd_nstatic * sizeof (gfs_dirent_t));
        }

        return (dp);
}
/*
 * gfs_dir_lookup_dynamic()
 *
 * This routine looks up the provided name amongst the dynamic entries
 * in the gfs directory and returns the corresponding vnode, if found.
 *
 * The gfs directory is expected to be locked by the caller prior to
 * calling this function. The directory will be unlocked during the
 * execution of this function, but will be locked upon return from the
 * function. This function returns 0 on success, non-zero on error.
 *
 * The dynamic lookups are performed by invoking the lookup
 * callback, which is passed to this function as the first argument.
 * The arguments to the callback are:
 *
 * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp,
 *     ino64_t *inop, cred_t *cr, int flags, int *direntflgs,
 *     pathname_t *realpnp);
 *
 *   pvp        - parent vnode
 *   nm         - name of entry
 *   vpp        - pointer to resulting vnode
 *   inop       - output parameter, the inode number of the entry
 *   cr         - pointer to cred
 *   flags      - flags value from lookup request
 *                ignored here; currently only used to request
 *                case-insensitive lookups
 *   direntflgs - output parameter, directory entry flags
 *                ignored here; currently only used to indicate a lookup
 *                has more than one possible match when case is not considered
 *   realpnp    - output parameter, real pathname
 *                ignored here; when lookup was performed case-insensitively,
 *                this field contains the "real" name of the file.
 *
 * Returns 0 on success, non-zero on error. An illustrative sketch of a
 * conforming callback follows this function.
 */
static int
gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
    const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
    int *direntflags, pathname_t *realpnp)
{
        gfs_file_t *fp;
        ino64_t ino;
        int ret;

        ASSERT(GFS_DIR_LOCKED(dp));

        /*
         * Drop the directory lock, as the lookup routine
         * will need to allocate memory and could otherwise deadlock on this
         * directory.
         */
        gfs_dir_unlock(dp);
        ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
        gfs_dir_lock(dp);

        /*
         * The callback for extended attributes returns a vnode
         * with v_data from an underlying fs.
         */
        if (ret == 0 && !IS_XATTRDIR(dvp)) {
                fp = (gfs_file_t *)((*vpp)->v_data);
                fp->gfs_index = -1;
                fp->gfs_ino = ino;
        }

        return (ret);
}
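/*
 * A minimal sketch of a dynamic lookup callback matching the signature
 * documented above. mypfs_lookup_cb, mypfs_find_ino, mypfs_file_t and
 * mypfs_file_ops are hypothetical. The callback constructs (or finds) the
 * vnode, reports the entry's inode number through inop, and returns 0 or an
 * errno; gfs_dir_lookup_dynamic() then records that inode number in the new
 * vnode's gfs_file_t:
 *
 *	static int
 *	mypfs_lookup_cb(vnode_t *dvp, const char *nm, vnode_t **vpp,
 *	    ino64_t *inop, cred_t *cr, int flags, int *direntflgs,
 *	    pathname_t *realpnp)
 *	{
 *		ino64_t ino;
 *
 *		if (mypfs_find_ino(dvp, nm, &ino) != 0)
 *			return (ENOENT);
 *
 *		*vpp = gfs_file_create(sizeof (mypfs_file_t), dvp,
 *		    mypfs_file_ops);
 *		*inop = ino;
 *		return (0);
 *	}
 */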
/*
 * gfs_dir_lookup_static()
 *
 * This routine looks up the provided name amongst the static entries
 * in the gfs directory and returns the corresponding vnode, if found.
 * The first argument to the function is a pointer to the comparison
 * function this function should use to decide if names are a match.
 *
 * If a match is found, and GFS_CACHE_VNODE is set and the vnode
 * exists, we simply return the existing vnode. Otherwise, we call
 * the static entry's callback routine, caching the result if
 * necessary. If the idx pointer argument is non-NULL, we use it to
 * return the index of the matching static entry.
 *
 * The gfs directory is expected to be locked by the caller prior to calling
 * this function. The directory may be unlocked during the execution of
 * this function, but will be locked upon return from the function.
 *
 * This function returns 0 if a match is found, ENOENT if not.
 */
static int
gfs_dir_lookup_static(int (*compare)(const char *, const char *),
    gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
    vnode_t **vpp, pathname_t *rpnp)
{
        gfs_dirent_t *ge;
        vnode_t *vp = NULL;
        int i;

        ASSERT(GFS_DIR_LOCKED(dp));

        /*
         * Search static entries.
         */
        for (i = 0; i < dp->gfsd_nstatic; i++) {
                ge = &dp->gfsd_static[i];

                if (compare(ge->gfse_name, nm) == 0) {
                        if (rpnp)
                                (void) strlcpy(rpnp->pn_buf, ge->gfse_name,
                                    rpnp->pn_bufsize);

                        if (ge->gfse_vnode) {
                                ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
                                vp = ge->gfse_vnode;
                                VN_HOLD(vp);
                                break;
                        }

                        /*
                         * We drop the directory lock, as the constructor will
                         * need to do KM_SLEEP allocations. If we return from
                         * the constructor only to find that a parallel
                         * operation has completed, and GFS_CACHE_VNODE is set
                         * for this entry, we discard the result in favor of
                         * the cached vnode.
                         */
                        gfs_dir_unlock(dp);
                        vp = ge->gfse_ctor(dvp);
                        gfs_dir_lock(dp);

                        ((gfs_file_t *)vp->v_data)->gfs_index = i;

                        /* Set the inode according to the callback. */
                        ((gfs_file_t *)vp->v_data)->gfs_ino =
                            dp->gfsd_inode(dvp, i);

                        if (ge->gfse_flags & GFS_CACHE_VNODE) {
                                if (ge->gfse_vnode == NULL) {
                                        ge->gfse_vnode = vp;
                                } else {
                                        /*
                                         * A parallel constructor beat us to it;
                                         * return existing vnode. We have to be
                                         * careful because we can't release the
                                         * current vnode while holding the
                                         * directory lock; its inactive routine
                                         * will try to lock this directory.
                                         */
                                        vnode_t *oldvp = vp;
                                        vp = ge->gfse_vnode;
                                        VN_HOLD(vp);

                                        gfs_dir_unlock(dp);
                                        VN_RELE(oldvp);
                                        gfs_dir_lock(dp);
                                }
                        }
                        break;
                }
        }

        if (vp == NULL)
                return (ENOENT);
        else if (idx)
                *idx = i;
        *vpp = vp;
        return (0);
}
/*
 * gfs_dir_lookup()
 *
 * Looks up the given name in the directory and returns the corresponding
 * vnode, if found.
 *
 * First, we search statically defined entries, if any, with a call to
 * gfs_dir_lookup_static(). If no static entry is found, and we have
 * a callback function, we try a dynamic lookup via gfs_dir_lookup_dynamic().
 *
 * This function returns 0 on success, non-zero on error.
 */
int
gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
    int flags, int *direntflags, pathname_t *realpnp)
{
        gfs_dir_t *dp = dvp->v_data;
        boolean_t casecheck;
        vnode_t *dynvp = NULL;
        vnode_t *vp = NULL;
        int (*compare)(const char *, const char *);
        int error, idx;

        ASSERT(dvp->v_type == VDIR);

        if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
                return (0);

        casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
        if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
            (flags & FIGNORECASE))
                compare = strcasecmp;
        else
                compare = strcmp;

        gfs_dir_lock(dp);

        error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);

        if (vp && casecheck) {
                gfs_dirent_t *ge;
                int i;

                for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
                        ge = &dp->gfsd_static[i];

                        if (strcasecmp(ge->gfse_name, nm) == 0) {
                                *direntflags |= ED_CASE_CONFLICT;
                                goto out;
                        }
                }
        }

        if ((error || casecheck) && dp->gfsd_lookup)
                error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
                    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);

        if (vp && dynvp) {
                /* static and dynamic entries are a case-insensitive conflict */
                ASSERT(casecheck);
                *direntflags |= ED_CASE_CONFLICT;
                VN_RELE(dynvp);
        } else if (vp == NULL) {
                vp = dynvp;
        } else if (error == ENOENT) {
                error = 0;
        } else if (error) {
                VN_RELE(vp);
                vp = NULL;
        }

out:
        gfs_dir_unlock(dp);

        *vpp = vp;
        return (error);
}
/*
 * gfs_dir_readdir: does a readdir() on the given directory
 *
 *   dvp  - directory vnode
 *   uiop - uio structure
 *   eofp - eof pointer
 *   data - arbitrary data passed to readdir callback
 *
 * This routine does all the readdir() dirty work. Even so, the caller must
 * supply two callbacks in order to get full compatibility.
 *
 * If the directory contains static entries, an inode callback must be
 * specified. This avoids having to create every vnode and call fop_getattr()
 * when reading the directory. This function has the following arguments:
 *
 *	ino64_t gfs_inode_cb(vnode_t *vp, int index);
 *
 *	vp     - vnode for the directory
 *	index  - index in original gfs_dirent_t array
 *
 *	Returns the inode number for the given entry.
 *
 * For directories with dynamic entries, a readdir callback must be provided.
 * This is significantly more complex, thanks to the particulars of
 * fop_readdir().
 *
 *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *off, offset_t *nextoff, void *data, int flags)
 *
 *	vp      - directory vnode
 *	dp      - directory entry, sized according to maxlen given to
 *	          gfs_dir_create(). callback must fill in d_name and
 *	          d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
 *	          (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
 *	          is set in 'flags'.
 *	eofp    - callback must set to 1 when EOF has been reached
 *	off     - on entry, the last offset read from the directory. Callback
 *	          must set to the offset of the current entry, typically left
 *	          untouched.
 *	nextoff - callback must set to offset of next entry. Typically
 *	          (off + 1)
 *	data    - caller-supplied data
 *	flags   - fop_readdir flags
 *
 *	Return 0 on success, or error on failure.
 *
 * An illustrative sketch of both callbacks follows this function.
 */
int
gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
    caller_context_t *ct, int flags)
{
        gfs_readdir_state_t gstate;
        int error, eof = 0;
        ino64_t ino, pino;
        offset_t off, next;
        gfs_dir_t *dp = dvp->v_data;

        error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
        if (error)
                return (error);

        if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
            pino, ino, flags)) != 0)
                return (error);

        while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
            !eof) {

                if (off >= 0 && off < dp->gfsd_nstatic) {
                        ino = dp->gfsd_inode(dvp, off);

                        if ((error = gfs_readdir_emit(&gstate, uiop,
                            off, ino, dp->gfsd_static[off].gfse_name, 0))
                            != 0)
                                break;

                } else if (dp->gfsd_readdir) {
                        off -= dp->gfsd_nstatic;

                        if ((error = dp->gfsd_readdir(dvp,
                            gstate.grd_dirent, &eof, &off, &next,
                            data, flags)) != 0 || eof)
                                break;

                        off += dp->gfsd_nstatic + 2;
                        next += dp->gfsd_nstatic + 2;

                        if ((error = gfs_readdir_emit_int(&gstate, uiop,
                            next)) != 0)
                                break;
                } else {
                        /*
                         * Offset is beyond the end of the static entries, and
                         * we have no dynamic entries. Set EOF.
                         */
                        eof = 1;
                }
        }

        return (gfs_readdir_fini(&gstate, error, eofp, eof));
}
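/*
 * A minimal sketch of the two callbacks documented above, for a hypothetical
 * directory whose dynamic entries are numbered 0 .. mypfs_count(vp) - 1 and
 * whose maxlen was MAXNAMELEN. All mypfs_* names are illustrative only, and
 * the sketch fills in a dirent64_t; a callback that supports V_RDDIR_ENTFLAGS
 * would fill in an edirent_t instead when that flag is set:
 *
 *	static ino64_t
 *	mypfs_inode_cb(vnode_t *vp, int index)
 *	{
 *		return (MYPFS_INO_BASE + index);
 *	}
 *
 *	static int
 *	mypfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
 *	    offset_t *offp, offset_t *nextp, void *data, int flags)
 *	{
 *		dirent64_t *dep = dp;
 *
 *		if (*offp >= mypfs_count(vp)) {
 *			*eofp = 1;
 *			return (0);
 *		}
 *		(void) snprintf(dep->d_name, MAXNAMELEN, "%lld",
 *		    (longlong_t)*offp);
 *		dep->d_ino = mypfs_dynamic_ino(vp, *offp);
 *		*nextp = *offp + 1;
 *		return (0);
 *	}
 */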
/*
 * gfs_vop_lookup: fop_lookup() entry point
 *
 * For use directly in vnode ops table. Given a GFS directory, calls
 * gfs_dir_lookup() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
        return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
}

/*
 * gfs_vop_readdir: fop_readdir() entry point
 *
 * For use directly in vnode ops table. Given a GFS directory, calls
 * gfs_dir_readdir() as necessary.
 */
/* ARGSUSED */
int
gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
        return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
}
/*
 * gfs_vop_map: fop_map() entry point
 *
 * Convenience routine for handling pseudo-files that wish to allow mmap()
 * calls. This function only works for read-only files, and uses the read
 * function for the vnode to fill in the data. The mapped data is immediately
 * faulted in and filled with the necessary data during this call; there are
 * no getpage() or putpage() routines.
 */
/* ARGSUSED */
int
gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    caller_context_t *ct)
{
        int rv;
        ssize_t resid = len;

        /*
         * Check for bad parameters
         */
#ifdef _ILP32
        if (len > MAXOFF_T)
                return (ENOMEM);
#endif
        if (vp->v_flag & VNOMAP)
                return (ENOTSUP);
        if (off > MAXOFF_T)
                return (EFBIG);
        if ((long)off < 0 || (long)(off + len) < 0)
                return (EINVAL);
        if (vp->v_type != VREG)
                return (ENODEV);
        if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
                return (EACCES);

        /*
         * Find appropriate address if needed, otherwise clear address range.
         */
        as_rangelock(as);
        rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
        if (rv != 0) {
                as_rangeunlock(as);
                return (rv);
        }

        /*
         * Create mapping
         */
        rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
        as_rangeunlock(as);
        if (rv != 0)
                return (rv);

        /*
         * Fill with data from read()
         */
        rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
            0, 0, cred, &resid);

        if (rv == 0 && resid != 0)
                rv = ENXIO;

        if (rv != 0) {
                as_rangelock(as);
                (void) as_unmap(as, *addrp, len);
                as_rangeunlock(as);
        }

        return (rv);
}
/*
 * gfs_vop_inactive: fop_inactive() entry point
 *
 * Given a vnode that is a GFS file or directory, calls gfs_file_inactive() or
 * gfs_dir_inactive() as necessary, and kmem_free()s the associated private
 * data.
 */
/* ARGSUSED */
void
gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
        gfs_file_t *fp = vp->v_data;
        void *data;

        if (fp->gfs_type == GFS_DIR)
                data = gfs_dir_inactive(vp);
        else
                data = gfs_file_inactive(vp);

        if (data != NULL)
                kmem_free(data, fp->gfs_size);
}