stop shipping useless ksh93 builtins into /usr/bin
[unleashed.git] / kernel / fs / ufs / ufs_dir.c
blobcbb986e93a09bb2e1c395f25902edfcf0b97a5e6
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
39 * Directory manipulation routines.
41 * When manipulating directories, the i_rwlock provides serialization
42 * since directories cannot be mmapped. The i_contents lock is redundant.
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signal.h>
50 #include <sys/cred.h>
51 #include <sys/proc.h>
52 #include <sys/disp.h>
53 #include <sys/user.h>
54 #include <sys/vfs.h>
55 #include <sys/vnode.h>
56 #include <sys/stat.h>
57 #include <sys/mode.h>
58 #include <sys/buf.h>
59 #include <sys/uio.h>
60 #include <sys/dnlc.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/mount.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_quota.h>
68 #include <sys/errno.h>
69 #include <sys/debug.h>
70 #include <vm/seg.h>
71 #include <sys/sysmacros.h>
72 #include <sys/cmn_err.h>
73 #include <sys/cpuvar.h>
74 #include <sys/unistd.h>
75 #include <sys/policy.h>
78 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
80 #if !ISP2(DIRBLKSIZ)
81 #error "DIRBLKSIZ not a power of 2"
82 #endif
85 * A virgin directory.
/*
 * Initializer for the two entries of a new directory block.  The string
 * members are "." and "..".  NOTE(review): struct dirtemplate is not
 * declared in this file; presumably each row is (inode number, record
 * length, name length, name), so "." takes 12 bytes and ".." the remaining
 * DIRBLKSIZ - 12 bytes -- confirm against <sys/fs/ufs_fsdir.h>.
 */
87 static struct dirtemplate mastertemplate = {
88 0, 12, 1, ".",
89 0, DIRBLKSIZ - 12, 2, ".."
/*
 * LDIRSIZ(len): size of a directory record holding a name of len bytes:
 * sizeof (struct direct) less MAXNAMLEN + 1 bytes (presumably the d_name
 * buffer -- confirm against the struct declaration), plus len + 1 bytes
 * rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len): the inverse bound -- len less that same fixed
 * part, less one more byte.
 *
 * Both macros parenthesize their argument so that an expression argument
 * (for example one using '&' or '?:') is evaluated as a unit.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
98 * The dnlc directory cache allows a 64 bit handle for directory entries.
99 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
100 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
101 * is way beyond what could be cached in memory by the directory
102 * caching routines. So we are quite safe with this limit.
103 * The macros below pack and unpack the handle.
/*
 * Handle packing helpers: the low 32 bits of a handle carry the inode
 * number and the high 32 bits carry the directory offset.
 *
 * Each expansion is fully parenthesized so it can be used inside a larger
 * expression.  INO_OFF_TO_H() widens and masks ino to 32 bits before
 * OR-ing it in, so a sign-extended or wider-than-32-bit ino cannot
 * overwrite the offset half of the handle.
 */
#define	H_TO_INO(h)	((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h)	((off_t)((h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | ((uint64_t)(ino) & UINT_MAX)))
110 * The average size of a typical on disk directory entry is about 16 bytes
111 * and so defines AV_DIRECT_SHIFT : log2(16)
112 * This define is only used to approximate the number of entries
113 * in a directory. This is needed for dnlc_dir_start() which will immediately
114 * return an error if the value is not within its acceptable range of
115 * number of files in a directory.
117 #define AV_DIRECT_SHIFT 4
119 * If the directory size (from i_size) is greater than the ufs_min_dir_cache
120 * tunable then we request dnlc directory caching.
121 * This has been found to be profitable after 1024 file names.
/*
 * ufs_min_dir_cache is compared against a directory's i_size; directory
 * caching via dnlc_dir_start() is only attempted at or above it.
 */
123 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;
/*
 * ufs_dc_disable_at is stamped with gethrtime() when dnlc_dir_start()
 * returns DNOMEM; caching is re-enabled for a CD_DISABLED_NOMEM directory
 * once more than ufs_dc_disable_duration (5 * NANOSEC) has elapsed.
 */
125 /* The time point the dnlc directory caching was disabled */
126 static hrtime_t ufs_dc_disable_at;
127 /* directory caching disable duration */
128 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;
/*
 * dirchk: when nonzero the lookup loop calls dirmangled() on every entry,
 * not only on entries whose d_reclen is not a multiple of 4.
 * Defaults to 1 under DEBUG, 0 otherwise.
 */
130 #ifdef DEBUG
131 int dirchk = 1;
132 #else /* !DEBUG */
133 int dirchk = 0;
134 #endif /* DEBUG */
/*
 * ufs_negative_cache: when nonzero, a failed lookup in a directory with
 * i_nlink > 0 is entered in the dnlc as DNLC_NO_VNODE.
 */
135 int ufs_negative_cache = 1;
/* NOTE(review): presumably a retry counter for ufs_dirremove(); not referenced in this part of the file. */
136 uint64_t ufs_dirremove_retry_cnt;
/*
 * Forward declarations of file-local helpers.  They are written with empty
 * parentheses, i.e. without parameter lists, so the compiler does not check
 * call arguments against them.
 */
138 static void dirbad();
139 static int ufs_dirrename();
140 static int ufs_diraddentry();
141 static int ufs_dirempty();
142 static int ufs_dirscan();
143 static int ufs_dirclrdotdot();
144 static int ufs_dirfixdotdot();
145 static int ufs_dirpurgedotdot();
146 static int dirprepareentry();
147 static int ufs_dirmakedirect();
148 static int dirbadname();
149 static int dirmangled();
152 * Check accessibility of directory against inquired mode and type.
153 * Execute access is required to search the directory.
154 * Access for write is interpreted as allowing
155 * deletion of files in the directory.
156 * Note, the reader i_contents lock will be acquired in
157 * ufs_iaccess().
/*
 * ufs_diraccess: returns ENOTDIR unless the file type of ip
 * (i_mode & IFMT) is IFDIR or IFATTRDIR; otherwise returns the result of
 * ufs_iaccess(ip, mode, cr, 1).
 */
160 ufs_diraccess(struct inode *ip, int mode, struct cred *cr)
162 if (((ip->i_mode & IFMT) != IFDIR) &&
163 ((ip->i_mode & IFMT) != IFATTRDIR))
164 return (ENOTDIR);
166 return (ufs_iaccess(ip, mode, cr, 1));
170 * Look for a given name in a directory. On successful return, *ipp
171 * will point to the VN_HELD inode.
172 * The caller is responsible for checking accessibility upfront
173 * via ufs_diraccess().
/*
 * ufs_dirlook: look up namep in directory dp.
 *
 * Three sources are consulted in order:
 *   1. dnlc_lookup() on the directory vnode (skipped when skipdnlc is set);
 *   2. dnlc_dir_lookup() on the per-directory cache at dp->i_danchor;
 *   3. a scan of the directory blocks read with blkatoff(), which may also
 *      populate the per-directory cache unless skipcaching is set or the
 *      directory is smaller than ufs_min_dir_cache.
 *
 * Returns 0 with *ipp set to the found inode; ENOENT when the name is not
 * present; EAGAIN when ufs_tryirwlock() reports a possible deadlock; or an
 * error from blkatoff() / ufs_iget_alloced().
 *
 * dp->i_rwlock is taken as RW_READER after the dnlc_lookup() check.  Every
 * later return path either releases it with rw_exit() or returns EAGAIN
 * because re-acquiring it reported a deadlock.
 */
176 ufs_dirlook(
177 struct inode *dp,
178 char *namep,
179 struct inode **ipp,
180 struct cred *cr,
181 int skipdnlc, /* skip the 1st level dnlc */
182 int skipcaching) /* force directory caching off */
184 uint64_t handle;
185 struct fbuf *fbp; /* a buffer of directory entries */
186 struct direct *ep; /* the current directory entry */
187 struct vnode *vp;
188 struct vnode *dvp; /* directory vnode ptr */
189 struct ulockfs *ulp;
190 dcanchor_t *dcap;
191 off_t endsearch; /* offset to end directory search */
192 off_t offset;
193 off_t start_off; /* starting offset from middle search */
194 off_t last_offset; /* last offset */
195 int entryoffsetinblock; /* offset of ep in addr's buffer */
196 int numdirpasses; /* strategy for directory search */
197 int namlen; /* length of name */
198 int err;
199 int doingchk;
200 int i;
201 int caching;
202 int indeadlock;
203 ino_t ep_ino; /* entry i number */
204 ino_t chkino;
205 ushort_t ep_reclen; /* direct local d_reclen */
207 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */
/*
 * NOTE(review): ulp is assigned only when dp->i_ufsvfs is non-NULL, yet it
 * is passed to ufs_tryirwlock() unconditionally below, and dp->i_ufsvfs is
 * dereferenced without a check further down -- confirm callers guarantee
 * i_ufsvfs is set.
 */
209 if (dp->i_ufsvfs)
210 ulp = &dp->i_ufsvfs->vfs_ulockfs;
213 * Check the directory name lookup cache, first for individual files
214 * then for complete directories.
216 dvp = ITOV(dp);
217 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
218 /* vp is already held from dnlc_lookup */
219 if (vp == DNLC_NO_VNODE) {
220 VN_RELE(vp);
221 return (ENOENT);
223 *ipp = VTOI(vp);
224 return (0);
227 dcap = &dp->i_danchor;
230 * Grab the reader lock on the directory data before checking
231 * the dnlc to avoid a race with ufs_dirremove() & friends.
233 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
234 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
235 * possible, retries the operation.
237 indeadlock = ufs_tryirwlock(ulp, &dp->i_rwlock, RW_READER);
238 if (indeadlock)
239 return (EAGAIN);
/* Second-level lookup: the per-directory cache anchored at dp->i_danchor. */
241 switch (dnlc_dir_lookup(dcap, namep, &handle)) {
242 case DFOUND:
243 ep_ino = (ino_t)H_TO_INO(handle);
244 if (dp->i_number == ep_ino) {
245 VN_HOLD(dvp); /* want ourself, "." */
246 *ipp = dp;
247 rw_exit(&dp->i_rwlock);
248 return (0);
250 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
251 uint64_t handle2;
253 * release the lock on the dir we are searching
254 * to avoid a deadlock when grabbing the
255 * i_contents lock in ufs_iget_alloced().
257 rw_exit(&dp->i_rwlock);
258 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
259 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
260 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
262 * must recheck as we dropped dp->i_rwlock
264 indeadlock = ufs_tryirwlock(ulp, &dp->i_rwlock,
265 RW_READER);
266 if (indeadlock) {
267 if (!err)
268 VN_RELE(ITOV(*ipp));
269 return (EAGAIN);
271 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
272 == DFOUND) && (handle == handle2)) {
273 dnlc_update(dvp, namep, ITOV(*ipp));
274 rw_exit(&dp->i_rwlock);
275 return (0);
277 /* check failed, read the actual directory */
278 if (!err) {
279 VN_RELE(ITOV(*ipp));
281 goto restart;
283 /* usual case of not "." nor ".." */
284 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
285 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
286 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
287 if (err) {
288 rw_exit(&dp->i_rwlock);
289 return (err);
291 dnlc_update(dvp, namep, ITOV(*ipp));
292 rw_exit(&dp->i_rwlock);
293 return (0);
294 case DNOENT:
295 if (ufs_negative_cache && (dp->i_nlink > 0)) {
296 dnlc_enter(dvp, namep, DNLC_NO_VNODE);
298 rw_exit(&dp->i_rwlock);
299 return (ENOENT);
300 default:
301 break;
303 restart:
305 fbp = NULL;
306 doingchk = 0;
307 chkino = 0;
308 caching = 0;
311 * Attempt to cache any directories greater than the tunable
312 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM),
313 * disable caching for this directory and record the system time.
314 * Any attempt after the disable time has expired will enable
315 * the caching again.
317 if (!skipcaching && (dp->i_size >= ufs_min_dir_cache)) {
319 * if the directory caching disable time has expired
320 * enable the caching again.
322 if (dp->i_cachedir == CD_DISABLED_NOMEM &&
323 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
324 ufs_dc_disable_at = 0;
325 dp->i_cachedir = CD_ENABLED;
327 if (dp->i_cachedir == CD_ENABLED) {
328 switch (dnlc_dir_start(dcap, dp->i_size >>
329 AV_DIRECT_SHIFT)) {
330 case DNOMEM:
331 dp->i_cachedir = CD_DISABLED_NOMEM;
332 ufs_dc_disable_at = gethrtime();
333 break;
334 case DTOOBIG:
335 dp->i_cachedir = CD_DISABLED_TOOBIG;
336 break;
337 case DOK:
338 caching = 1;
339 break;
340 default:
341 break;
346 * If caching we don't stop when the file has been
347 * found, but need to know later, so clear *ipp now
349 *ipp = NULL;
351 recheck:
352 if (caching) {
353 offset = 0;
354 entryoffsetinblock = 0;
355 numdirpasses = 1;
356 } else {
358 * Take care to look at dp->i_diroff only once, as it
359 * may be changing due to other threads/cpus.
361 offset = dp->i_diroff;
362 if (offset > dp->i_size) {
363 offset = 0;
365 if (offset == 0) {
366 entryoffsetinblock = 0;
367 numdirpasses = 1;
368 } else {
369 start_off = offset;
371 entryoffsetinblock = blkoff(dp->i_fs, offset);
372 if (entryoffsetinblock != 0) {
373 err = blkatoff(dp, offset, (char **)0, &fbp);
374 if (err)
375 goto bad;
377 numdirpasses = 2;
/* The scan runs up to i_size rounded up to a multiple of DIRBLKSIZ. */
380 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, uoff_t);
381 namlen = strlen(namep);
382 last_offset = 0;
384 searchloop:
385 while (offset < endsearch) {
387 * If offset is on a block boundary,
388 * read the next directory block.
389 * Release previous if it exists.
391 if (blkoff(dp->i_fs, offset) == 0) {
392 if (fbp != NULL) {
393 fbrelse(fbp, S_OTHER);
395 err = blkatoff(dp, offset, (char **)0, &fbp);
396 if (err)
397 goto bad;
398 entryoffsetinblock = 0;
402 * If the offset to the next entry is invalid or if the
403 * next entry is a zero length record or if the record
404 * length is invalid, then skip to the next directory
405 * block. Complete validation checks are done if the
406 * record length is invalid.
408 * Full validation checks are slow so they are disabled
409 * by default. Complete checks can be run by patching
410 * "dirchk" to be true.
412 * We have to check the validity of entryoffsetinblock
413 * here because it can be set to i_diroff above.
415 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
416 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
417 (dirchk || (ep->d_reclen & 0x3)) &&
418 dirmangled(dp, ep, entryoffsetinblock, offset)) {
/* Advance both offsets to the start of the next DIRBLKSIZ chunk. */
419 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
420 offset += i;
421 entryoffsetinblock += i;
422 if (caching) {
423 dnlc_dir_purge(dcap);
424 caching = 0;
426 continue;
429 ep_reclen = ep->d_reclen;
432 * Add named entries and free space into the directory cache
434 if (caching) {
435 ushort_t extra;
436 off_t off2;
438 if (ep->d_ino == 0) {
439 extra = ep_reclen;
440 if (offset & (DIRBLKSIZ - 1)) {
441 dnlc_dir_purge(dcap);
442 dp->i_cachedir = CD_DISABLED;
443 caching = 0;
445 } else {
447 * entries hold the previous offset except the
448 * 1st which holds the offset + 1
450 if (offset & (DIRBLKSIZ - 1)) {
451 off2 = last_offset;
452 } else {
453 off2 = offset + 1;
455 caching = (dnlc_dir_add_entry(dcap, ep->d_name,
456 INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
457 extra = ep_reclen - DIRSIZ(ep);
459 if (caching && (extra >= LDIRSIZ(1))) {
460 caching = (dnlc_dir_add_space(dcap, extra,
461 (uint64_t)offset) == DOK);
466 * Check for a name match.
467 * We have the parent inode read locked with i_rwlock.
469 if (ep->d_ino && ep->d_namlen == namlen &&
470 *namep == *ep->d_name && /* fast chk 1st chr */
471 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
474 * We have to release the fbp early here to avoid
475 * a possible deadlock situation where we have the
476 * fbp and want the directory inode and someone doing
477 * a ufs_direnter_* has the directory inode and wants
478 * the fbp. XXX - is this still needed?
480 ep_ino = (ino_t)ep->d_ino;
481 ASSERT(fbp != NULL);
482 fbrelse(fbp, S_OTHER);
483 fbp = NULL;
486 * Atomic update (read lock held)
488 dp->i_diroff = offset;
490 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
491 struct timeval32 omtime;
493 if (caching) {
494 dnlc_dir_purge(dcap);
495 caching = 0;
497 if (doingchk) {
499 * if the inumber didn't change
500 * continue with already found inode.
502 if (ep_ino == chkino)
503 goto checkok;
504 else {
505 VN_RELE(ITOV(*ipp));
506 /* *ipp is nulled at restart */
507 goto restart;
511 * release the lock on the dir we are searching
512 * to avoid a deadlock when grabbing the
513 * i_contents lock in ufs_iget_alloced().
515 omtime = dp->i_mtime;
516 rw_exit(&dp->i_rwlock);
517 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
518 RW_READER);
519 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
520 cr);
521 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
522 indeadlock = ufs_tryirwlock(ulp, &dp->i_rwlock,
523 RW_READER);
524 if (indeadlock) {
525 if (!err)
526 VN_RELE(ITOV(*ipp));
527 return (EAGAIN);
529 if (err)
530 goto bad;
532 * Since we released the lock on the directory,
533 * we must check that the same inode is still
534 * the ".." entry for this directory.
536 /*CSTYLED*/
537 if (timercmp(&omtime, &dp->i_mtime, !=)) {
539 * Modification time changed on the
540 * directory, we must go check if
541 * the inumber changed for ".."
543 doingchk = 1;
544 chkino = ep_ino;
545 entryoffsetinblock = 0;
546 if (caching) {
548 * Forget directory caching
549 * for this rare case
551 dnlc_dir_purge(dcap);
552 caching = 0;
554 goto recheck;
556 } else if (dp->i_number == ep_ino) {
557 VN_HOLD(dvp); /* want ourself, "." */
558 *ipp = dp;
559 if (caching) {
560 dnlc_dir_purge(dcap);
561 caching = 0;
563 } else {
564 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
565 RW_READER);
566 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
567 cr);
568 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
569 if (err)
570 goto bad;
572 checkok:
573 ASSERT(*ipp);
574 dnlc_update(dvp, namep, ITOV(*ipp));
576 * If we are not caching then just return the entry
577 * otherwise complete loading up the cache
579 if (!caching) {
580 rw_exit(&dp->i_rwlock);
581 return (0);
583 err = blkatoff(dp, offset, (char **)0, &fbp);
584 if (err)
585 goto bad;
587 last_offset = offset;
588 offset += ep_reclen;
589 entryoffsetinblock += ep_reclen;
592 * If we started in the middle of the directory and failed
593 * to find our target, we must check the beginning as well.
595 if (numdirpasses == 2) {
596 numdirpasses--;
597 offset = 0;
598 endsearch = start_off;
599 goto searchloop;
603 * If whole directory caching is on (or was originally on) then
604 * the entry may have been found.
606 if (*ipp == NULL) {
607 err = ENOENT;
608 if (ufs_negative_cache && (dp->i_nlink > 0)) {
609 dnlc_enter(dvp, namep, DNLC_NO_VNODE);
612 if (caching) {
613 dnlc_dir_complete(dcap);
614 caching = 0;
617 bad:
618 if (err && *ipp) {
620 * err and *ipp can both be set if we were attempting to
621 * cache the directory, and we found the entry, then later
622 * while trying to complete the directory cache encountered
623 * a error (eg reading a directory sector).
625 VN_RELE(ITOV(*ipp));
626 *ipp = NULL;
629 if (fbp)
630 fbrelse(fbp, S_OTHER);
631 rw_exit(&dp->i_rwlock);
632 if (caching)
633 dnlc_dir_purge(dcap);
634 return (err);
638 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
/*
 * ufs_direnter_cm: create a new inode and enter it under namep in tdp.
 *
 * The caller holds tdp->i_rwlock as writer (asserted).  flags is split
 * into "quiet" (the IQUIET bit, OR-ed into tdp->i_flag around inode
 * creation and cleared again at "out") and "noentry" (every other bit,
 * passed through to ufs_dircheckforname()).
 *
 * Returns:
 *   0       new inode created and entered; *ipp is set to it.
 *   EEXIST  the name already exists; *ipp is set to the existing inode.
 *           For "." and ".." this is the result of a successful
 *           ufs_dirlook().
 *   EINVAL  tdp is an IFATTRDIR and op is DE_MKDIR or vap->va_type is one
 *           of VCHR, VBLK, VDOOR, VSOCK, VFIFO.
 *   EACCES  namep contains a '/'.
 *   ENOENT  tdp->i_nlink <= 0.
 *   EAGAIN  ufs_dirlook() or ufs_tryirwlock() reported a possible deadlock
 *           on the "." / ".." path; i_rwlock is not re-acquired then.
 *   other   errors from ufs_diraccess(), ufs_dircheckforname(),
 *           ufs_iaccess(), ufs_dirmakeinode() or ufs_diraddentry().
 */
641 ufs_direnter_cm(
642 struct inode *tdp, /* target directory to make entry in */
643 char *namep, /* name of entry */
644 enum de_op op, /* entry operation */
645 struct vattr *vap, /* attributes if new inode needed */
646 struct inode **ipp, /* return entered inode here */
647 struct cred *cr, /* user credentials */
648 int flags) /* no entry exists */
650 struct inode *tip; /* inode of (existing) target file */
651 char *s;
652 struct ufs_slot slot; /* slot info to pass around */
653 int namlen; /* length of name */
654 int err; /* error number */
655 struct inode *nip; /* new inode */
656 int do_rele_nip = 0; /* release nip */
657 int noentry = flags & ~IQUIET;
658 int quiet = flags & IQUIET; /* Suppress out of inodes message */
659 int indeadlock;
660 struct ulockfs *ulp;
662 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
664 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
665 ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
666 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
667 (vap->va_type == VFIFO))))
668 return (EINVAL);
670 /* don't allow '/' characters in pathname component */
671 for (s = namep, namlen = 0; *s; s++, namlen++)
672 if (*s == '/')
673 return (EACCES);
674 ASSERT(namlen);
677 * Check accessibility of target directory.
679 if (err = ufs_diraccess(tdp, IEXEC, cr))
680 return (err);
683 * If name is "." or ".." then if this is a create look it up
684 * and return EEXIST.
686 if (namep[0] == '.' &&
687 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
689 * ufs_dirlook will acquire the i_rwlock
/*
 * NOTE(review): ulp is assigned only when tdp->i_ufsvfs is non-NULL but is
 * passed to ufs_tryirwlock() unconditionally below -- confirm callers
 * guarantee i_ufsvfs is set.
 */
691 if (tdp->i_ufsvfs)
692 ulp = &tdp->i_ufsvfs->vfs_ulockfs;
693 rw_exit(&tdp->i_rwlock);
694 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0, 0)) {
695 if (err == EAGAIN)
696 return (err);
699 * ufs_tryirwlock uses rw_tryenter and checks for
700 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
701 * If deadlock possible, retries the operation.
703 indeadlock = ufs_tryirwlock(ulp, &tdp->i_rwlock,
704 RW_WRITER);
705 if (indeadlock)
706 return (EAGAIN);
708 return (err);
710 indeadlock = ufs_tryirwlock(ulp, &tdp->i_rwlock, RW_WRITER);
711 if (indeadlock) {
712 VN_RELE(ITOV(*ipp));
713 return (EAGAIN);
715 return (EEXIST);
719 * If target directory has not been removed, then we can consider
720 * allowing file to be created.
722 if (tdp->i_nlink <= 0) {
723 return (ENOENT);
727 * Search for the entry. Return VN_HELD tip if found.
729 tip = NULL;
730 slot.fbp = NULL;
731 slot.status = NONE;
732 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
733 rw_enter(&tdp->i_contents, RW_WRITER);
734 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
735 if (err)
736 goto out;
737 if (tip) {
738 ASSERT(!noentry);
739 *ipp = tip;
740 err = EEXIST;
741 } else {
743 * The entry does not exist. Check write permission in
744 * directory to see if entry can be created.
746 if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
747 goto out;
749 * Make new inode and directory entry.
751 tdp->i_flag |= quiet;
752 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
753 if (nip != NULL)
754 do_rele_nip = 1;
755 goto out;
757 if (err = ufs_diraddentry(tdp, namep, op,
758 namlen, &slot, nip, NULL, cr)) {
760 * Unmake the inode we just made.
762 rw_enter(&nip->i_contents, RW_WRITER);
/* A new directory had bumped the parent's link count; take that back. */
763 if (((nip->i_mode & IFMT) == IFDIR) ||
764 ((nip->i_mode & IFMT) == IFATTRDIR)) {
765 tdp->i_nlink--;
766 ufs_setreclaim(tdp);
767 tdp->i_flag |= ICHG;
768 tdp->i_seq++;
769 TRANS_INODE(tdp->i_ufsvfs, tdp);
770 ITIMES_NOLOCK(tdp);
772 nip->i_nlink = 0;
773 ufs_setreclaim(nip);
774 TRANS_INODE(nip->i_ufsvfs, nip);
775 nip->i_flag |= ICHG;
776 nip->i_seq++;
777 ITIMES_NOLOCK(nip);
778 rw_exit(&nip->i_contents);
779 do_rele_nip = 1;
780 } else {
781 *ipp = nip;
785 out:
786 if (slot.fbp)
787 fbrelse(slot.fbp, S_OTHER);
789 tdp->i_flag &= ~quiet;
790 rw_exit(&tdp->i_contents);
793 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
794 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
796 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
798 if (do_rele_nip) {
799 VN_RELE(ITOV(nip));
802 return (err);
806 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
/*
 * ufs_direnter_lr: enter an existing inode (sip) under namep in tdp for
 * DE_LINK, DE_SYMLINK or DE_RENAME.
 *
 * The caller holds tdp->i_rwlock as writer (asserted).  The sequence is:
 *   1. reject names containing '/' (EACCES) and the names "." / ".."
 *      (EINVAL for DE_RENAME, EEXIST otherwise);
 *   2. sync sip's data with TRANS_SYNCIP() and its indirect blocks with
 *      ufs_sync_indir(); fail with ENOENT if sip->i_nlink <= 0 or EMLINK if
 *      it equals MAXLINK;
 *   3. increment sip->i_nlink (not for DE_SYMLINK) and write the inode
 *      with ufs_iupdat(sip, I_SYNC);
 *   4. look the name up with ufs_dircheckforname(): if it exists,
 *      DE_RENAME calls ufs_dirrename() and the other ops yield EEXIST;
 *      if it does not, check IWRITE access and call ufs_diraddentry().
 *
 * On any error after step 3 the link-count increment is undone at "out2"
 * (again skipped for DE_SYMLINK).  Returns 0 or the first error met.
 */
809 ufs_direnter_lr(
810 struct inode *tdp, /* target directory to make entry in */
811 char *namep, /* name of entry */
812 enum de_op op, /* entry operation */
813 struct inode *sdp, /* source inode parent if rename */
814 struct inode *sip, /* source inode */
815 struct cred *cr) /* user credentials */
817 struct inode *tip; /* inode of (existing) target file */
818 char *s;
819 struct ufs_slot slot; /* slot info to pass around */
820 int namlen; /* length of name */
821 int err; /* error number */
823 /* don't allow '/' characters in pathname component */
824 for (s = namep, namlen = 0; *s; s++, namlen++)
825 if (*s == '/')
826 return (EACCES);
827 ASSERT(namlen);
828 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
831 * If name is "." or ".." then if this is a create look it up
832 * and return EEXIST. Rename or link TO "." or ".." is forbidden.
834 if (namep[0] == '.' &&
835 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
836 if (op == DE_RENAME) {
837 return (EINVAL); /* *SIGH* should be ENOTEMPTY */
839 return (EEXIST);
842 * For link and rename lock the source entry and check the link count
843 * to see if it has been removed while it was unlocked. If not, we
844 * increment the link count and force the inode to disk to make sure
845 * that it is there before any directory entry that points to it.
847 * In the case of a symbolic link, we are dealing with a new inode
848 * which does not yet have any links. We've created it with a link
849 * count of 1, and we don't want to increment it since this will be
850 * its first link.
852 * We are about to push the inode to disk. We make sure
853 * that the inode's data blocks are flushed first so the
854 * inode and it's data blocks are always in sync. This
855 * adds some robustness in in the event of a power failure
856 * or panic where sync fails. If we panic before the
857 * inode is updated, then the inode still refers to the
858 * old data blocks (or none for a new file). If we panic
859 * after the inode is updated, then the inode refers to
860 * the new data blocks.
862 * We do this before grabbing the i_contents lock because
863 * ufs_syncip() will want that lock. We could do the data
864 * syncing after the removal checks, but upon return from
865 * the data sync we would have to repeat the removal
866 * checks.
868 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
869 return (err);
872 rw_enter(&sip->i_contents, RW_WRITER);
873 if (sip->i_nlink <= 0) {
874 rw_exit(&sip->i_contents);
875 return (ENOENT);
877 if (sip->i_nlink == MAXLINK) {
878 rw_exit(&sip->i_contents);
879 return (EMLINK);
883 * Sync the indirect blocks associated with the file
884 * for the same reasons as described above. Since this
885 * call wants the i_contents lock held for it we can do
886 * this here with no extra work.
888 if (err = ufs_sync_indir(sip)) {
889 rw_exit(&sip->i_contents);
890 return (err);
893 if (op != DE_SYMLINK)
894 sip->i_nlink++;
895 TRANS_INODE(sip->i_ufsvfs, sip);
896 sip->i_flag |= ICHG;
897 sip->i_seq++;
898 ufs_iupdat(sip, I_SYNC);
899 rw_exit(&sip->i_contents);
902 * If target directory has not been removed, then we can consider
903 * allowing file to be created.
/* From here on, failures go through out2 so the link-count bump is undone. */
905 if (tdp->i_nlink <= 0) {
906 err = ENOENT;
907 goto out2;
911 * Check accessibility of target directory.
913 if (err = ufs_diraccess(tdp, IEXEC, cr))
914 goto out2;
917 * Search for the entry. Return VN_HELD tip if found.
919 tip = NULL;
920 slot.status = NONE;
921 slot.fbp = NULL;
922 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
923 rw_enter(&tdp->i_contents, RW_WRITER);
924 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
925 if (err)
926 goto out;
928 if (tip) {
929 switch (op) {
930 case DE_RENAME:
931 err = ufs_dirrename(sdp, sip, tdp, namep,
932 tip, &slot, cr);
933 break;
935 case DE_LINK:
936 case DE_SYMLINK:
938 * Can't link to an existing file.
940 err = EEXIST;
941 break;
942 default:
943 break;
945 } else {
947 * The entry does not exist. Check write permission in
948 * directory to see if entry can be created.
950 if (err = ufs_iaccess(tdp, IWRITE, cr, 0))
951 goto out;
952 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
953 cr);
956 out:
957 if (slot.fbp)
958 fbrelse(slot.fbp, S_OTHER);
960 rw_exit(&tdp->i_contents);
963 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
964 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
966 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);
969 * If we renamed a file over the top of an existing file,
970 * or linked a file to an existing file (or tried to),
971 * then release and delete (or just release) the inode.
973 if (tip)
974 VN_RELE(ITOV(tip));
976 out2:
977 if (err) {
979 * Undo bumped link count.
981 if (op != DE_SYMLINK) {
982 rw_enter(&sip->i_contents, RW_WRITER);
983 sip->i_nlink--;
984 ufs_setreclaim(sip);
985 TRANS_INODE(sip->i_ufsvfs, sip);
986 sip->i_flag |= ICHG;
987 sip->i_seq++;
988 ITIMES_NOLOCK(sip);
989 rw_exit(&sip->i_contents);
992 return (err);
996 * Check for the existence of a name in a directory (unless noentry
997 * is set), or else of an empty
998 * slot in which an entry may be made. If the requested name is found,
999 * then on return *ipp points at the inode and slotp->offset contains
1000 * its offset in the directory. If the name is not found, then *ipp
1001 * will be NULL and *slotp will contain information about a directory slot in
1002 * which an entry may be made (either an empty slot, or the first position
1003 * past the end of the directory).
1004 * The target directory inode (tdp) is supplied write locked (i_rwlock).
1006 * This may not be used on "." or "..", but aliases of "." are ok.
1009 ufs_dircheckforname(
1010 struct inode *tdp, /* inode of directory being checked */
1011 char *namep, /* name we're checking for */
1012 int namlen, /* length of name, excluding null */
1013 struct ufs_slot *slotp, /* slot structure */
1014 struct inode **ipp, /* return inode if we find one */
1015 struct cred *cr,
1016 int noentry) /* noentry - just look for space */
1018 uint64_t handle;
1019 struct fbuf *fbp; /* pointer to directory block */
1020 struct direct *ep; /* directory entry */
1021 struct direct *nep; /* next directory entry */
1022 dcanchor_t *dcap;
1023 vnode_t *dvp; /* directory vnode ptr */
1024 off_t dirsize; /* size of the directory */
1025 off_t offset; /* offset in the directory */
1026 off_t last_offset; /* last offset */
1027 off_t enduseful; /* pointer past last used dir slot */
1028 int entryoffsetinblk; /* offset of ep in fbp's buffer */
1029 int i; /* length of mangled entry */
1030 int needed;
1031 int err;
1032 int first;
1033 int caching;
1034 int stat;
1035 ino_t ep_ino;
1036 slotstat_t initstat = slotp->status;
1038 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1039 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1040 ASSERT(*ipp == NULL);
1041 fbp = NULL;
1044 * First check if there is a complete cache of the directory.
1046 dvp = ITOV(tdp);
1048 dcap = &tdp->i_danchor;
1049 if (noentry) {
1051 * We know from the 1st level dnlc cache that the entry
1052 * doesn't exist, so don't bother searching the directory
1053 * cache, but just look for space (possibly in the directory
1054 * cache).
1056 stat = DNOENT;
1057 } else {
1058 stat = dnlc_dir_lookup(dcap, namep, &handle);
1060 switch (stat) {
1061 case DFOUND:
1062 ep_ino = (ino_t)H_TO_INO(handle);
1063 if (tdp->i_number == ep_ino) {
1064 *ipp = tdp; /* we want ourself, ie "." */
1065 VN_HOLD(dvp);
1066 } else {
1067 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
1068 if (err)
1069 return (err);
1071 offset = H_TO_OFF(handle);
1072 first = 0;
1073 if (offset & 1) {
1074 /* This is the first entry in the block */
1075 first = 1;
1076 offset -= 1;
1077 ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1079 err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1080 if (err) {
1081 VN_RELE(ITOV(*ipp));
1082 *ipp = NULL;
1083 return (err);
1086 * Check the validity of the entry.
1087 * If it's bad, then throw away the cache and
1088 * continue without it. The dirmangled() routine
1089 * will then be called upon it.
1091 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1092 VN_RELE(ITOV(*ipp));
1093 *ipp = NULL;
1094 dnlc_dir_purge(dcap);
1095 break;
1098 * Remember the returned offset is the offset of the
1099 * preceding record (unless this is the 1st record
1100 * in the DIRBLKSIZ sized block (disk sector)), then it's
1101 * offset + 1. Note, no real offsets are on odd boundaries.
1103 if (first) {
1104 ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
1105 slotp->offset = offset;
1106 slotp->size = 0;
1107 slotp->ep = ep;
1108 } else {
1109 /* get the next entry */
1110 nep = (struct direct *)((char *)ep + ep->d_reclen);
1112 * Check the validity of this entry as well
1113 * If it's bad, then throw away the cache and
1114 * continue without it. The dirmangled() routine
1115 * will then be called upon it.
1117 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1118 (nep->d_ino != ep_ino)) {
1119 VN_RELE(ITOV(*ipp));
1120 *ipp = NULL;
1121 dnlc_dir_purge(dcap);
1122 break;
1124 slotp->offset = offset + ep->d_reclen;
1125 slotp->size = ep->d_reclen;
1126 slotp->ep = nep;
1128 slotp->status = EXIST;
1129 slotp->fbp = fbp;
1130 slotp->endoff = 0;
1131 slotp->cached = 1;
1132 dnlc_update(dvp, namep, ITOV(*ipp));
1133 return (0);
1134 case DNOENT:
1136 * The caller gets to set the initial slot status to
1137 * indicate whether it's interested in getting a
1138 * empty slot. For example, the status can be set
1139 * to FOUND when an entry is being deleted.
1141 ASSERT(slotp->fbp == NULL);
1142 if (slotp->status == FOUND) {
1143 return (0);
1145 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
1146 &handle)) {
1147 case DFOUND:
1148 offset = (off_t)handle;
1149 err = blkatoff(tdp, offset, (char **)&ep, &fbp);
1150 if (err) {
1151 dnlc_dir_purge(dcap);
1152 ASSERT(*ipp == NULL);
1153 return (err);
1156 * Check the validity of the entry.
1157 * If it's bad, then throw away the cache and
1158 * continue without it. The dirmangled() routine
1159 * will then be called upon it.
1161 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
1162 dnlc_dir_purge(dcap);
1163 break;
1166 * Remember the returned offset is the offset of the
1167 * containing record.
1169 slotp->status = FOUND;
1170 slotp->ep = ep;
1171 slotp->offset = offset;
1172 slotp->fbp = fbp;
1173 slotp->size = ep->d_reclen;
1175 * Set end offset to 0. Truncation is handled
1176 * because the dnlc cache will blow away the
1177 * cached directory when an entry is removed
1178 * that drops the entries left to less than half
1179 * the minumum number (dnlc_min_dir_cache).
1181 slotp->endoff = 0;
1182 slotp->cached = 1;
1183 return (0);
1184 case DNOENT:
1185 slotp->status = NONE;
1186 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
1187 DIRBLKSIZ, uoff_t);
1188 slotp->size = DIRBLKSIZ;
1189 slotp->endoff = 0;
1190 slotp->cached = 1;
1191 return (0);
1192 default:
1193 break;
1195 break;
1197 slotp->cached = 0;
1198 caching = 0;
1199 if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
1201 * if the directory caching disable time has expired
1202 * enable caching again.
1204 if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
1205 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
1206 ufs_dc_disable_at = 0;
1207 tdp->i_cachedir = CD_ENABLED;
1210 * Attempt to cache any directories greater than the tunable
1211 * ufs_min_cache_dir. If it fails due to memory shortage
1212 * (DNOMEM), disable caching for this directory and record
1213 * the system time. Any attempt after the disable time has
1214 * expired will enable the caching again.
1216 if (tdp->i_cachedir == CD_ENABLED) {
1217 switch (dnlc_dir_start(dcap,
1218 tdp->i_size >> AV_DIRECT_SHIFT)) {
1219 case DNOMEM:
1220 tdp->i_cachedir = CD_DISABLED_NOMEM;
1221 ufs_dc_disable_at = gethrtime();
1222 break;
1223 case DTOOBIG:
1224 tdp->i_cachedir = CD_DISABLED_TOOBIG;
1225 break;
1226 case DOK:
1227 caching = 1;
1228 break;
1229 default:
1230 break;
1236 * No point in using i_diroff since we must search whole directory
1238 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, uoff_t);
1239 enduseful = 0;
1240 offset = last_offset = 0;
1241 entryoffsetinblk = 0;
1242 needed = (int)LDIRSIZ(namlen);
1243 while (offset < dirsize) {
1245 * If offset is on a block boundary,
1246 * read the next directory block.
1247 * Release previous if it exists.
1249 if (blkoff(tdp->i_fs, offset) == 0) {
1250 if (fbp != NULL)
1251 fbrelse(fbp, S_OTHER);
1253 err = blkatoff(tdp, offset, (char **)0, &fbp);
1254 if (err) {
1255 ASSERT(*ipp == NULL);
1256 if (caching) {
1257 dnlc_dir_purge(dcap);
1259 return (err);
1261 entryoffsetinblk = 0;
1264 * If still looking for a slot, and at a DIRBLKSIZ
1265 * boundary, have to start looking for free space
1266 * again.
1268 if (slotp->status == NONE &&
1269 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
1270 slotp->offset = -1;
1273 * If the next entry is a zero length record or if the
1274 * record length is invalid, then skip to the next
1275 * directory block. Complete validation checks are
1276 * done if the record length is invalid.
1278 * Full validation checks are slow so they are disabled
1279 * by default. Complete checks can be run by patching
1280 * "dirchk" to be true.
1282 * We do not have to check the validity of
1283 * entryoffsetinblk here because it starts out as zero
1284 * and is only incremented by d_reclen values that we
1285 * validate here.
1287 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
1288 if (ep->d_reclen == 0 ||
1289 (dirchk || (ep->d_reclen & 0x3)) &&
1290 dirmangled(tdp, ep, entryoffsetinblk, offset)) {
1291 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
1292 offset += i;
1293 entryoffsetinblk += i;
1294 if (caching) {
1295 dnlc_dir_purge(dcap);
1296 caching = 0;
1298 continue;
1302 * Add named entries and free space into the directory cache
1304 if (caching) {
1305 ushort_t extra;
1306 off_t off2;
1308 if (ep->d_ino == 0) {
1309 extra = ep->d_reclen;
1310 if (offset & (DIRBLKSIZ - 1)) {
1311 dnlc_dir_purge(dcap);
1312 caching = 0;
1314 } else {
1316 * entries hold the previous offset if
1317 * not the 1st one
1319 if (offset & (DIRBLKSIZ - 1)) {
1320 off2 = last_offset;
1321 } else {
1322 off2 = offset + 1;
1324 caching = (dnlc_dir_add_entry(dcap, ep->d_name,
1325 INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
1326 extra = ep->d_reclen - DIRSIZ(ep);
1328 if (caching && (extra >= LDIRSIZ(1))) {
1329 caching = (dnlc_dir_add_space(dcap, extra,
1330 (uint64_t)offset) == DOK);
1335 * If an appropriate sized slot has not yet been found,
1336 * check to see if one is available.
1338 if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
1339 int size = ep->d_reclen;
1341 if (ep->d_ino != 0)
1342 size -= DIRSIZ(ep);
1343 if (size > 0) {
1344 if (size >= needed) {
1345 slotp->offset = offset;
1346 slotp->size = ep->d_reclen;
1347 if (noentry) {
1348 slotp->ep = ep;
1349 slotp->fbp = fbp;
1350 slotp->status = FOUND;
1351 slotp->endoff = 0;
1352 return (0);
1354 slotp->status = FOUND;
1355 } else if (slotp->status == NONE) {
1356 if (slotp->offset == -1)
1357 slotp->offset = offset;
1362 * Check for a name match.
1364 if (ep->d_ino && ep->d_namlen == namlen &&
1365 *namep == *ep->d_name && /* fast chk 1st char */
1366 bcmp(namep, ep->d_name, namlen) == 0) {
1368 tdp->i_diroff = offset;
1370 if (tdp->i_number == ep->d_ino) {
1371 *ipp = tdp; /* we want ourself, ie "." */
1372 VN_HOLD(dvp);
1373 } else {
1374 err = ufs_iget_alloced(tdp->i_vfs,
1375 (ino_t)ep->d_ino, ipp, cr);
1376 if (err) {
1377 fbrelse(fbp, S_OTHER);
1378 if (caching)
1379 dnlc_dir_purge(dcap);
1380 return (err);
1383 slotp->status = EXIST;
1384 slotp->offset = offset;
1385 slotp->size = (int)(offset - last_offset);
1386 slotp->fbp = fbp;
1387 slotp->ep = ep;
1388 slotp->endoff = 0;
1389 if (caching)
1390 dnlc_dir_purge(dcap);
1391 return (0);
1393 last_offset = offset;
1394 offset += ep->d_reclen;
1395 entryoffsetinblk += ep->d_reclen;
1396 if (ep->d_ino)
1397 enduseful = offset;
1399 if (fbp) {
1400 fbrelse(fbp, S_OTHER);
1403 if (caching) {
1404 dnlc_dir_complete(dcap);
1405 slotp->cached = 1;
1406 if (slotp->status == FOUND) {
1407 if (initstat == FOUND) {
1408 return (0);
1410 (void) dnlc_dir_rem_space_by_handle(dcap,
1411 slotp->offset);
1412 slotp->endoff = 0;
1413 return (0);
1417 if (slotp->status == NONE) {
1419 * We didn't find a slot; the new directory entry should be put
1420 * at the end of the directory. Return an indication of where
1421 * this is, and set "endoff" to zero; since we're going to have
1422 * to extend the directory, we're certainly not going to
1423 * truncate it.
1425 slotp->offset = dirsize;
1426 slotp->size = DIRBLKSIZ;
1427 slotp->endoff = 0;
1428 } else {
1430 * We found a slot, and will return an indication of where that
1431 * slot is, as any new directory entry will be put there.
1432 * Since that slot will become a useful entry, if the last
1433 * useful entry we found was before this one, update the offset
1434 * of the last useful entry.
1436 if (enduseful < slotp->offset + slotp->size)
1437 enduseful = slotp->offset + slotp->size;
1438 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
1440 *ipp = NULL;
1441 return (0);
1444 uint64_t ufs_dirrename_retry_cnt;
1447 * Rename the entry in the directory tdp so that it points to
1448 * sip instead of tip.
1450 static int
1451 ufs_dirrename(
1452 struct inode *sdp, /* parent directory of source */
1453 struct inode *sip, /* source inode */
1454 struct inode *tdp, /* parent directory of target */
1455 char *namep, /* entry we are trying to change */
1456 struct inode *tip, /* target inode */
1457 struct ufs_slot *slotp, /* slot for entry */
1458 struct cred *cr) /* credentials */
1460 vnode_t *tdvp;
1461 off_t offset;
1462 int err;
1463 int doingdirectory;
1465 ASSERT(sdp->i_ufsvfs != NULL);
1466 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1467 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1469 * Short circuit rename of something to itself.
1471 if (sip->i_number == tip->i_number) {
1472 return (ESAME); /* special KLUDGE error code */
1476 * We're locking 2 peer level locks, so must use tryenter
1477 * on the 2nd to avoid deadlocks that would occur
1478 * if we renamed a->b and b->a concurrently.
1480 retry:
1481 rw_enter(&tip->i_contents, RW_WRITER);
1482 if (!rw_tryenter(&sip->i_contents, RW_READER)) {
1484 * drop tip and wait (sleep) until we stand a chance
1485 * of holding sip
1487 rw_exit(&tip->i_contents);
1488 rw_enter(&sip->i_contents, RW_READER);
1490 * Reverse the lock grabs in case we have heavy
1491 * contention on the 2nd lock.
1493 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
1494 ufs_dirrename_retry_cnt++;
1495 rw_exit(&sip->i_contents);
1496 goto retry;
1501 * Check that everything is on the same filesystem.
1503 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
1504 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
1505 err = EXDEV; /* XXX archaic */
1506 goto out;
1509 * Must have write permission to rewrite target entry.
1510 * Perform additional checks for sticky directories.
1512 if ((err = ufs_iaccess(tdp, IWRITE, cr, 0)) != 0 ||
1513 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
1514 goto out;
1517 * Ensure source and target are compatible (both directories
1518 * or both not directories). If target is a directory it must
1519 * be empty and have no links to it; in addition it must not
1520 * be a mount point, and both the source and target must be
1521 * writable.
1523 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
1524 ((sip->i_mode & IFMT) == IFATTRDIR));
1525 if (((tip->i_mode & IFMT) == IFDIR) ||
1526 ((tip->i_mode & IFMT) == IFATTRDIR)) {
1527 if (!doingdirectory) {
1528 err = EISDIR;
1529 goto out;
1532 * vn_vfsrlock will prevent mounts from using the directory
1533 * until we are done.
1535 if (vn_vfsrlock(ITOV(tip))) {
1536 err = EBUSY;
1537 goto out;
1539 if (vn_mountedvfs(ITOV(tip)) != NULL) {
1540 vn_vfsunlock(ITOV(tip));
1541 err = EBUSY;
1542 goto out;
1544 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
1545 vn_vfsunlock(ITOV(tip));
1546 err = EEXIST; /* SIGH should be ENOTEMPTY */
1547 goto out;
1549 } else if (doingdirectory) {
1550 err = ENOTDIR;
1551 goto out;
1555 * Rewrite the inode pointer for target name entry
1556 * from the target inode (ip) to the source inode (sip).
1557 * This prevents the target entry from disappearing
1558 * during a crash. Mark the directory inode to reflect the changes.
1560 tdvp = ITOV(tdp);
1561 slotp->ep->d_ino = (int32_t)sip->i_number;
1562 dnlc_update(tdvp, namep, ITOV(sip));
1563 if (slotp->size) {
1564 offset = slotp->offset - slotp->size;
1565 } else {
1566 offset = slotp->offset + 1;
1568 if (slotp->cached) {
1569 (void) dnlc_dir_update(&tdp->i_danchor, namep,
1570 INO_OFF_TO_H(slotp->ep->d_ino, offset));
1573 err = TRANS_DIR(tdp, slotp->offset);
1574 if (err)
1575 fbrelse(slotp->fbp, S_OTHER);
1576 else
1577 err = ufs_fbwrite(slotp->fbp, tdp);
1579 slotp->fbp = NULL;
1580 if (err) {
1581 if (doingdirectory)
1582 vn_vfsunlock(ITOV(tip));
1583 goto out;
1586 TRANS_INODE(tdp->i_ufsvfs, tdp);
1587 tdp->i_flag |= IUPD|ICHG;
1588 tdp->i_seq++;
1589 ITIMES_NOLOCK(tdp);
1592 * Decrement the link count of the target inode.
1593 * Fix the ".." entry in sip to point to dp.
1594 * This is done after the new entry is on the disk.
1596 tip->i_nlink--;
1597 TRANS_INODE(tip->i_ufsvfs, tip);
1598 tip->i_flag |= ICHG;
1599 tip->i_seq++;
1600 ITIMES_NOLOCK(tip);
1601 if (doingdirectory) {
1603 * The entry for tip no longer exists so I can unlock the
1604 * vfslock.
1606 vn_vfsunlock(ITOV(tip));
1608 * Decrement target link count once more if it was a directory.
1610 if (--tip->i_nlink != 0) {
1611 err = ufs_fault(ITOV(tip),
1612 "ufs_dirrename: target directory link count != 0 (%s)",
1613 tip->i_fs->fs_fsmnt);
1614 rw_exit(&tip->i_contents);
1615 return (err);
1617 TRANS_INODE(tip->i_ufsvfs, tip);
1618 ufs_setreclaim(tip);
1620 * Renaming a directory with the parent different
1621 * requires that ".." be rewritten. The window is
1622 * still there for ".." to be inconsistent, but this
1623 * is unavoidable, and a lot shorter than when it was
1624 * done in a user process. We decrement the link
1625 * count in the new parent as appropriate to reflect
1626 * the just-removed target. If the parent is the
1627 * same, this is appropriate since the original
1628 * directory is going away. If the new parent is
1629 * different, ufs_dirfixdotdot() will bump the link count
1630 * back.
1632 tdp->i_nlink--;
1633 ufs_setreclaim(tdp);
1634 TRANS_INODE(tdp->i_ufsvfs, tdp);
1635 tdp->i_flag |= ICHG;
1636 tdp->i_seq++;
1637 ITIMES_NOLOCK(tdp);
1638 if (sdp != tdp) {
1639 rw_exit(&tip->i_contents);
1640 rw_exit(&sip->i_contents);
1641 err = ufs_dirfixdotdot(sip, sdp, tdp);
1642 return (err);
1644 } else
1645 ufs_setreclaim(tip);
1646 out:
1647 rw_exit(&tip->i_contents);
1648 rw_exit(&sip->i_contents);
1649 return (err);
1653 * Fix the ".." entry of the child directory so that it points
1654 * to the new parent directory instead of the old one. Routine
1655 * assumes that dp is a directory and that all the inodes are on
1656 * the same file system.
1658 static int
1659 ufs_dirfixdotdot(
1660 struct inode *dp, /* child directory */
1661 struct inode *opdp, /* old parent directory */
1662 struct inode *npdp) /* new parent directory */
1664 struct fbuf *fbp;
1665 struct dirtemplate *dirp;
1666 vnode_t *dvp;
1667 int err;
1669 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
1670 ASSERT(RW_WRITE_HELD(&npdp->i_contents));
1673 * We hold the child directory's i_contents lock before calling
1674 * blkatoff so that we honor correct locking protocol which is
1675 * i_contents lock and then page lock. (blkatoff will call
1676 * ufs_getpage where we want the page lock)
1677 * We hold the child directory's i_rwlock before i_contents (as
1678 * per the locking protocol) since we are modifying the ".." entry
1679 * of the child directory.
1680 * We hold the i_rwlock and i_contents lock until we record
1681 * this directory delta to the log (via ufs_trans_dir) and have
1682 * done fbrelse.
1684 rw_enter(&dp->i_rwlock, RW_WRITER);
1685 rw_enter(&dp->i_contents, RW_WRITER);
1686 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
1687 if (err)
1688 goto bad;
1690 if (dp->i_nlink <= 0 ||
1691 dp->i_size < sizeof (struct dirtemplate)) {
1692 err = ENOENT;
1693 goto bad;
1696 if (dirp->dotdot_namlen != 2 ||
1697 dirp->dotdot_name[0] != '.' ||
1698 dirp->dotdot_name[1] != '.') { /* Sanity check. */
1699 dirbad(dp, "mangled .. entry", (off_t)0);
1700 err = ENOTDIR;
1701 goto bad;
1705 * Increment the link count in the new parent inode and force it out.
1707 if (npdp->i_nlink == MAXLINK) {
1708 err = EMLINK;
1709 goto bad;
1711 npdp->i_nlink++;
1712 TRANS_INODE(npdp->i_ufsvfs, npdp);
1713 npdp->i_flag |= ICHG;
1714 npdp->i_seq++;
1715 ufs_iupdat(npdp, I_SYNC);
1718 * Rewrite the child ".." entry and force it out.
1720 dvp = ITOV(dp);
1721 dirp->dotdot_ino = (uint32_t)npdp->i_number;
1722 dnlc_update(dvp, "..", ITOV(npdp));
1723 (void) dnlc_dir_update(&dp->i_danchor, "..",
1724 INO_OFF_TO_H(dirp->dotdot_ino, 0));
1726 err = TRANS_DIR(dp, 0);
1727 if (err)
1728 fbrelse(fbp, S_OTHER);
1729 else
1730 err = ufs_fbwrite(fbp, dp);
1732 fbp = NULL;
1733 if (err)
1734 goto bad;
1736 rw_exit(&dp->i_contents);
1737 rw_exit(&dp->i_rwlock);
1740 * Decrement the link count of the old parent inode and force it out.
1742 ASSERT(opdp);
1743 rw_enter(&opdp->i_contents, RW_WRITER);
1744 ASSERT(opdp->i_nlink > 0);
1745 opdp->i_nlink--;
1746 ufs_setreclaim(opdp);
1747 TRANS_INODE(opdp->i_ufsvfs, opdp);
1748 opdp->i_flag |= ICHG;
1749 opdp->i_seq++;
1750 ufs_iupdat(opdp, I_SYNC);
1751 rw_exit(&opdp->i_contents);
1752 return (0);
1754 bad:
1755 if (fbp)
1756 fbrelse(fbp, S_OTHER);
1757 rw_exit(&dp->i_contents);
1758 rw_exit(&dp->i_rwlock);
1759 return (err);
1763 * Enter the file sip in the directory tdp with name namep.
1765 static int
1766 ufs_diraddentry(
1767 struct inode *tdp,
1768 char *namep,
1769 enum de_op op,
1770 int namlen,
1771 struct ufs_slot *slotp,
1772 struct inode *sip,
1773 struct inode *sdp,
1774 struct cred *cr)
1776 struct direct *ep, *nep;
1777 vnode_t *tdvp;
1778 dcanchor_t *dcap = &tdp->i_danchor;
1779 off_t offset;
1780 int err;
1781 ushort_t extra;
1783 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
1784 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
1786 * Prepare a new entry. If the caller has not supplied an
1787 * existing inode, make a new one.
1789 err = dirprepareentry(tdp, slotp, cr);
1790 if (err) {
1791 if (slotp->fbp) {
1792 fbrelse(slotp->fbp, S_OTHER);
1793 slotp->fbp = NULL;
1795 return (err);
1798 * Check inode to be linked to see if it is in the
1799 * same filesystem.
1801 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
1802 err = EXDEV;
1803 goto bad;
1807 * If renaming a directory then fix up the ".." entry in the
1808 * directory to point to the new parent.
1810 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
1811 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
1812 err = ufs_dirfixdotdot(sip, sdp, tdp);
1813 if (err)
1814 goto bad;
1818 * Fill in entry data.
1820 ep = slotp->ep;
1821 ep->d_namlen = (ushort_t)namlen;
1822 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
1823 ep->d_ino = (uint32_t)sip->i_number;
1824 tdvp = ITOV(tdp);
1825 dnlc_update(tdvp, namep, ITOV(sip));
1827 * Note the offset supplied for any named entry is
1828 * the offset of the previous one, unless it's the 1st.
1829 * slotp->size is used to pass the length to
1830 * the previous entry.
1832 if (slotp->size) {
1833 offset = slotp->offset - slotp->size;
1834 } else {
1835 offset = slotp->offset + 1;
1838 if (slotp->cached) {
1840 * Add back any usable unused space to the dnlc directory
1841 * cache.
1843 extra = ep->d_reclen - DIRSIZ(ep);
1844 if (extra >= LDIRSIZ(1)) {
1845 (void) dnlc_dir_add_space(dcap, extra,
1846 (uint64_t)slotp->offset);
1849 (void) dnlc_dir_add_entry(dcap, namep,
1850 INO_OFF_TO_H(ep->d_ino, offset));
1852 /* adjust the previous offset of the next entry */
1853 nep = (struct direct *)((char *)ep + ep->d_reclen);
1854 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
1856 * Not a new block.
1858 * Check the validity of the next entry.
1859 * If it's bad, then throw away the cache, and
1860 * continue as before directory caching.
1862 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
1863 dnlc_dir_update(dcap, nep->d_name,
1864 INO_OFF_TO_H(nep->d_ino, slotp->offset))
1865 == DNOENT) {
1866 dnlc_dir_purge(dcap);
1867 slotp->cached = 0;
1873 * Write out the directory block.
1875 err = TRANS_DIR(tdp, slotp->offset);
1876 if (err)
1877 fbrelse(slotp->fbp, S_OTHER);
1878 else
1879 err = ufs_fbwrite(slotp->fbp, tdp);
1881 slotp->fbp = NULL;
1883 * If this is a rename of a directory, then we have already
1884 * fixed the ".." entry to refer to the new parent. If err
1885 * is true at this point, we have failed to update the new
1886 * parent to refer to the renamed directory.
1887 * XXX - we need to unwind the ".." fix.
1889 if (err)
1890 return (err);
1893 * Mark the directory inode to reflect the changes.
1894 * Truncate the directory to chop off blocks of empty entries.
1897 TRANS_INODE(tdp->i_ufsvfs, tdp);
1898 tdp->i_flag |= IUPD|ICHG;
1899 tdp->i_seq++;
1900 tdp->i_diroff = 0;
1901 ITIMES_NOLOCK(tdp);
1903 * If the directory grew then dirprepareentry() will have
1904 * set IATTCHG in tdp->i_flag, then the directory inode must
1905 * be flushed out. This is because if fsync() is used later
1906 * the directory size must be correct, otherwise a crash would
1907 * cause fsck to move the file to lost+found. Also because later
1908 * a file may be linked in more than one directory, then there
1909 * is no way to flush the original directory. So it must be
1910 * flushed out on creation. See bug 4293809.
1912 if (tdp->i_flag & IATTCHG) {
1913 ufs_iupdat(tdp, I_SYNC);
1916 if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
1917 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
1918 (void) ufs_itrunc(tdp, (uoff_t)slotp->endoff, 0,
1919 cr);
1924 return (0);
1926 bad:
1927 if (slotp->cached) {
1928 dnlc_dir_purge(dcap);
1929 fbrelse(slotp->fbp, S_OTHER);
1930 slotp->cached = 0;
1931 slotp->fbp = NULL;
1932 return (err);
1936 * Clear out entry prepared by dirprepareent.
1938 slotp->ep->d_ino = 0;
1939 slotp->ep->d_namlen = 0;
1942 * Don't touch err so we don't clobber the real error that got us here.
1944 if (TRANS_DIR(tdp, slotp->offset))
1945 fbrelse(slotp->fbp, S_OTHER);
1946 else
1947 (void) ufs_fbwrite(slotp->fbp, tdp);
1948 slotp->fbp = NULL;
1949 return (err);
1953 * Prepare a directory slot to receive an entry.
1955 static int
1956 dirprepareentry(
1957 struct inode *dp, /* directory we are working in */
1958 struct ufs_slot *slotp, /* available slot info */
1959 struct cred *cr)
1961 struct direct *ep, *nep;
1962 off_t entryend;
1963 int err;
1964 slotstat_t status = slotp->status;
1965 ushort_t dsize;
1967 ASSERT((status == NONE) || (status == FOUND));
1968 ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
1969 ASSERT(RW_WRITE_HELD(&dp->i_contents));
1971 * If we didn't find a slot, then indicate that the
1972 * new slot belongs at the end of the directory.
1973 * If we found a slot, then the new entry can be
1974 * put at slotp->offset.
1976 entryend = slotp->offset + slotp->size;
1977 if (status == NONE) {
1978 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
1979 if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
1980 err = ufs_fault(ITOV(dp),
1981 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1982 " > dp->i_fs->fs_fsize: %d (%s)",
1983 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
1984 return (err);
1987 * Allocate the new block.
1989 err = BMAPALLOC(dp, (uoff_t)slotp->offset,
1990 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
1991 if (err) {
1992 return (err);
1994 dp->i_size = entryend;
1995 TRANS_INODE(dp->i_ufsvfs, dp);
1996 dp->i_flag |= IUPD|ICHG|IATTCHG;
1997 dp->i_seq++;
1998 ITIMES_NOLOCK(dp);
1999 } else if (entryend > dp->i_size) {
2001 * Adjust directory size, if needed. This should never
2002 * push the size past a new multiple of DIRBLKSIZ.
2003 * This is an artifact of the old (4.2BSD) way of initializing
2004 * directory sizes to be less than DIRBLKSIZ.
2006 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
2007 TRANS_INODE(dp->i_ufsvfs, dp);
2008 dp->i_flag |= IUPD|ICHG|IATTCHG;
2009 dp->i_seq++;
2010 ITIMES_NOLOCK(dp);
2014 * Get the block containing the space for the new directory entry.
2016 if (slotp->fbp == NULL) {
2017 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
2018 &slotp->fbp);
2019 if (err) {
2020 return (err);
2023 ep = slotp->ep;
2025 switch (status) {
2026 case NONE:
2028 * No space in the directory. slotp->offset will be on a
2029 * directory block boundary and we will write the new entry
2030 * into a fresh block.
2032 ep->d_reclen = DIRBLKSIZ;
2033 slotp->size = 0; /* length of previous entry */
2034 break;
2035 case FOUND:
2037 * An entry of the required size has been found. Use it.
2039 if (ep->d_ino == 0) {
2040 /* this is the 1st record in a block */
2041 slotp->size = 0; /* length of previous entry */
2042 } else {
2043 dsize = DIRSIZ(ep);
2044 nep = (struct direct *)((char *)ep + dsize);
2045 nep->d_reclen = ep->d_reclen - dsize;
2046 ep->d_reclen = dsize;
2047 slotp->ep = nep;
2048 slotp->offset += dsize;
2049 slotp->size = dsize; /* length of previous entry */
2051 break;
2052 default:
2053 break;
2055 return (0);
2059 * Allocate and initialize a new inode that will go into directory tdp.
2060 * This routine is called from ufs_symlink(), as well as within this file.
2063 ufs_dirmakeinode(
2064 struct inode *tdp,
2065 struct inode **ipp,
2066 struct vattr *vap,
2067 enum de_op op,
2068 struct cred *cr)
2070 struct inode *ip;
2071 enum vtype type;
2072 int imode; /* mode and format as in inode */
2073 ino_t ipref;
2074 int err;
2075 timestruc_t now;
2077 ASSERT(vap != NULL);
2078 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
2079 op == DE_SYMLINK);
2080 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
2081 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
2082 ASSERT(RW_WRITE_HELD(&tdp->i_contents));
2084 * Allocate a new inode.
2086 type = vap->va_type;
2087 if (type == VDIR) {
2088 ipref = dirpref(tdp);
2089 } else {
2090 ipref = tdp->i_number;
2092 if (op == DE_ATTRDIR)
2093 imode = vap->va_mode;
2094 else
2095 imode = MAKEIMODE(type, vap->va_mode);
2096 *ipp = NULL;
2097 err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
2098 if (err)
2099 return (err);
2102 * We don't need to grab vfs_dqrwlock here because it is held
2103 * in ufs_direnter_*() above us.
2105 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
2106 rw_enter(&ip->i_contents, RW_WRITER);
2107 if (ip->i_dquot != NULL) {
2108 err = ufs_fault(ITOV(ip),
2109 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2110 tdp->i_fs->fs_fsmnt);
2111 rw_exit(&ip->i_contents);
2112 return (err);
2114 *ipp = ip;
2115 ip->i_mode = (o_mode_t)imode;
2116 if (type == VBLK || type == VCHR) {
2117 dev_t d = vap->va_rdev;
2118 dev32_t dev32;
2121 * Don't allow a special file to be created with a
2122 * dev_t that cannot be represented by this filesystem
2123 * format on disk.
2125 if (!cmpldev(&dev32, d)) {
2126 err = EOVERFLOW;
2127 goto fail;
2130 ITOV(ip)->v_rdev = ip->i_rdev = d;
2132 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
2133 ip->i_ordev = dev32; /* can't use old format */
2134 } else {
2135 ip->i_ordev = cmpdev(d);
2138 ITOV(ip)->v_type = type;
2139 ufs_reset_vnode(ip->i_vnode);
2140 if (type == VDIR) {
2141 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
2142 } else {
2143 ip->i_nlink = 1;
2146 if (op == DE_ATTRDIR) {
2147 ip->i_uid = vap->va_uid;
2148 ip->i_gid = vap->va_gid;
2149 } else
2150 ip->i_uid = crgetuid(cr);
2152 * To determine the group-id of the created file:
2153 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0
2154 * clients are not likely to set the gid), then use it if
2155 * the process is privileged, belongs to the target group,
2156 * or the group is the same as the parent directory.
2157 * 2) If the filesystem was not mounted with the Old-BSD-compatible
2158 * GRPID option, and the directory's set-gid bit is clear,
2159 * then use the process's gid.
2160 * 3) Otherwise, set the group-id to the gid of the parent directory.
2162 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
2163 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
2164 secpolicy_vnode_create_gid(cr) == 0)) {
2166 * XXX - is this only the case when a 4.0 NFS client, or a
2167 * client derived from that code, makes a call over the wire?
2169 ip->i_gid = vap->va_gid;
2170 } else
2171 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);
2174 * For SunOS 5.0->5.4, the lines below read:
2176 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2177 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2179 * where MAXUID was set to 60002. See notes on this in ufs_inode.c
2181 ip->i_suid =
2182 (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
2183 ip->i_sgid =
2184 (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;
2187 * If we're creating a directory, and the parent directory has the
2188 * set-GID bit set, set it on the new directory.
2189 * Otherwise, if the user is neither privileged nor a member of the
2190 * file's new group, clear the file's set-GID bit.
2192 if ((tdp->i_mode & ISGID) && (type == VDIR))
2193 ip->i_mode |= ISGID;
2194 else {
2195 if ((ip->i_mode & ISGID) &&
2196 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
2197 ip->i_mode &= ~ISGID;
2200 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2201 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2202 err = EOVERFLOW;
2203 goto fail;
2207 * Extended attribute directories are not subject to quotas.
2209 if (op != DE_ATTRDIR)
2210 ip->i_dquot = getinoquota(ip);
2211 else
2212 ip->i_dquot = NULL;
2214 if (op == DE_MKDIR || op == DE_ATTRDIR) {
2215 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
2216 if (err)
2217 goto fail;
2221 * generate the shadow inode and attach it to the new object
2223 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
2224 (!tdp->i_shadow && !tdp->i_ufs_acl));
2225 if (tdp->i_shadow && tdp->i_ufs_acl &&
2226 (((tdp->i_mode & IFMT) == IFDIR) ||
2227 ((tdp->i_mode & IFMT) == IFATTRDIR))) {
2228 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
2229 if (err) {
2230 if (op == DE_MKDIR) {
2232 * clean up parent directory
2234 * tdp->i_contents already locked from
2235 * ufs_direnter_*()
2237 tdp->i_nlink--;
2238 TRANS_INODE(tdp->i_ufsvfs, tdp);
2239 tdp->i_flag |= ICHG;
2240 tdp->i_seq++;
2241 ufs_iupdat(tdp, I_SYNC);
2243 goto fail;
2248 * If the passed in attributes contain atime and/or mtime
2249 * settings, then use them instead of using the current
2250 * high resolution time.
2252 if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
2253 if (vap->va_mask & AT_ATIME) {
2254 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2255 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2256 ip->i_flag &= ~IACC;
2257 } else
2258 ip->i_flag |= IACC;
2259 if (vap->va_mask & AT_MTIME) {
2260 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2261 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2262 gethrestime(&now);
2263 if (now.tv_sec > TIME32_MAX) {
2265 * In 2038, ctime sticks forever..
2267 ip->i_ctime.tv_sec = TIME32_MAX;
2268 ip->i_ctime.tv_usec = 0;
2269 } else {
2270 ip->i_ctime.tv_sec = now.tv_sec;
2271 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2273 ip->i_flag &= ~(IUPD|ICHG);
2274 ip->i_flag |= IMODTIME;
2275 } else
2276 ip->i_flag |= IUPD|ICHG;
2277 ip->i_flag |= IMOD;
2278 } else
2279 ip->i_flag |= IACC|IUPD|ICHG;
2280 ip->i_seq++;
2283 * If this is an attribute tag it as one.
2285 if ((tdp->i_mode & IFMT) == IFATTRDIR) {
2286 ip->i_cflags |= IXATTR;
2290 * push inode before it's name appears in a directory
2292 TRANS_INODE(ip->i_ufsvfs, ip);
2293 ufs_iupdat(ip, I_SYNC);
2294 rw_exit(&ip->i_contents);
2295 return (0);
2297 fail:
2298 /* Throw away inode we just allocated. */
2299 ip->i_nlink = 0;
2300 ufs_setreclaim(ip);
2301 TRANS_INODE(ip->i_ufsvfs, ip);
2302 ip->i_flag |= ICHG;
2303 ip->i_seq++;
2304 ITIMES_NOLOCK(ip);
2305 rw_exit(&ip->i_contents);
2306 return (err);
2310 * Write a prototype directory into the empty inode ip, whose parent is dp.
2312 static int
2313 ufs_dirmakedirect(
2314 struct inode *ip, /* new directory */
2315 struct inode *dp, /* parent directory */
2316 int attrdir,
2317 struct cred *cr)
2319 struct dirtemplate *dirp;
2320 struct fbuf *fbp;
2321 int err;
2323 ASSERT(RW_WRITE_HELD(&ip->i_contents));
2324 ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2325 ASSERT(RW_WRITE_HELD(&dp->i_contents));
2327 * Allocate space for the directory we're creating.
2329 err = BMAPALLOC(ip, 0, DIRBLKSIZ, cr);
2330 if (err)
2331 return (err);
2332 if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
2333 err = ufs_fault(ITOV(dp),
2334 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2335 DIRBLKSIZ, dp->i_fs->fs_fsize,
2336 dp->i_fs->fs_fsmnt);
2337 return (err);
2339 ip->i_size = DIRBLKSIZ;
2340 TRANS_INODE(ip->i_ufsvfs, ip);
2341 ip->i_flag |= IUPD|ICHG|IATTCHG;
2342 ip->i_seq++;
2343 ITIMES_NOLOCK(ip);
2345 * Update the tdp link count and write out the change.
2346 * This reflects the ".." entry we'll soon write.
2348 if (dp->i_nlink == MAXLINK)
2349 return (EMLINK);
2350 if (attrdir == 0)
2351 dp->i_nlink++;
2352 TRANS_INODE(dp->i_ufsvfs, dp);
2353 dp->i_flag |= ICHG;
2354 dp->i_seq++;
2355 ufs_iupdat(dp, I_SYNC);
2357 * Initialize directory with "."
2358 * and ".." from static template.
2360 * Since the parent directory is locked, we don't have to
2361 * worry about anything changing when we drop the write
2362 * lock on (ip).
2365 err = fbread(ITOV(ip), 0, (uint_t)ip->i_fs->fs_fsize,
2366 S_READ, &fbp);
2368 if (err) {
2369 goto fail;
2371 dirp = (struct dirtemplate *)fbp->fb_addr;
2373 * Now initialize the directory we're creating
2374 * with the "." and ".." entries.
2376 *dirp = mastertemplate; /* structure assignment */
2377 dirp->dot_ino = (uint32_t)ip->i_number;
2378 dirp->dotdot_ino = (uint32_t)dp->i_number;
2380 err = TRANS_DIR(ip, 0);
2381 if (err) {
2382 fbrelse(fbp, S_OTHER);
2383 goto fail;
2386 err = ufs_fbwrite(fbp, ip);
2387 if (err) {
2388 goto fail;
2391 return (0);
2393 fail:
2394 if (attrdir == 0)
2395 dp->i_nlink--;
2396 TRANS_INODE(dp->i_ufsvfs, dp);
2397 dp->i_flag |= ICHG;
2398 dp->i_seq++;
2399 ufs_iupdat(dp, I_SYNC);
2400 return (err);
2404 * Delete a directory entry. If oip is nonzero the entry is checked
2405 * to make sure it still reflects oip.
2408 ufs_dirremove(
2409 struct inode *dp,
2410 char *namep,
2411 struct inode *oip,
2412 struct vnode *cdir,
2413 enum dr_op op,
2414 struct cred *cr)
2416 struct direct *ep, *pep, *nep;
2417 struct inode *ip;
2418 vnode_t *dvp, *vp;
2419 struct ufs_slot slot;
2420 int namlen;
2421 int err;
2422 int mode;
2423 ushort_t extra;
2425 namlen = (int)strlen(namep);
2426 if (namlen == 0) {
2427 struct fs *fs = dp->i_fs;
2429 cmn_err(CE_WARN, "%s: ufs_dirremove: attempted to remove"
2430 " nameless file in directory (directory inode %llu)",
2431 fs->fs_fsmnt, (u_longlong_t)dp->i_number);
2432 ASSERT(namlen != 0);
2434 return (ENOENT);
2438 * return error when removing . and ..
2440 if (namep[0] == '.') {
2441 if (namlen == 1)
2442 return (EINVAL);
2443 else if (namlen == 2 && namep[1] == '.') {
2444 return (EEXIST); /* SIGH should be ENOTEMPTY */
2448 ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
2450 retry:
2452 * Check accessibility of directory.
2454 if (err = ufs_diraccess(dp, IEXEC|IWRITE, cr))
2455 return (err);
2457 ip = NULL;
2458 slot.fbp = NULL;
2459 slot.status = FOUND; /* don't need to look for empty slot */
2460 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
2461 rw_enter(&dp->i_contents, RW_WRITER);
2463 err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
2464 if (err)
2465 goto out_novfs;
2466 if (ip == NULL) {
2467 err = ENOENT;
2468 goto out_novfs;
2470 vp = ITOV(ip);
2471 if (oip && oip != ip) {
2472 err = ENOENT;
2473 goto out_novfs;
2476 mode = ip->i_mode & IFMT;
2477 if (mode == IFDIR || mode == IFATTRDIR) {
2480 * vn_vfsrlock() prevents races between mount and rmdir.
2482 if (vn_vfsrlock(vp)) {
2483 err = EBUSY;
2484 goto out_novfs;
2486 if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
2487 err = EBUSY;
2488 goto out;
2491 * If we are removing a directory, get a lock on it.
2492 * Taking a writer lock prevents a parallel ufs_dirlook from
2493 * incorrectly entering a negative cache vnode entry in the dnlc
2494 * If the directory is empty, it will stay empty until
2495 * we can remove it.
2497 if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
2499 * It is possible that a thread in rename would have
2500 * acquired this rwlock. To prevent a deadlock we
2501 * do a rw_tryenter. If we fail to get the lock
2502 * we drop all the locks we have acquired, wait
2503 * for 2 ticks and reacquire the
2504 * directory's (dp) i_rwlock and try again.
2505 * If we dont drop dp's i_rwlock then we will panic
2506 * with a "Deadlock: cycle in blocking chain"
2507 * since in ufs_dircheckpath we want dp's i_rwlock.
2508 * dp is guaranteed to exist since ufs_dirremove is
2509 * called after a VN_HOLD(dp) has been done.
2511 ufs_dirremove_retry_cnt++;
2512 vn_vfsunlock(vp);
2513 if (slot.fbp)
2514 fbrelse(slot.fbp, S_OTHER);
2515 rw_exit(&dp->i_contents);
2516 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2517 rw_exit(&dp->i_rwlock);
2518 VN_RELE(vp);
2519 delay(2);
2520 rw_enter(&dp->i_rwlock, RW_WRITER);
2521 goto retry;
2524 rw_enter(&ip->i_contents, RW_READER);
2527 * Now check the restrictions that apply on sticky directories.
2529 if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
2530 rw_exit(&ip->i_contents);
2531 if (mode == IFDIR || mode == IFATTRDIR)
2532 rw_exit(&ip->i_rwlock);
2533 goto out;
2536 if (op == DR_RMDIR) {
2538 * For rmdir(2), some special checks are required.
2539 * (a) Don't remove any alias of the parent (e.g. ".").
2540 * (b) Don't remove the current directory.
2541 * (c) Make sure the entry is (still) a directory.
2542 * (d) Make sure the directory is empty.
2545 if (dp == ip || vp == cdir)
2546 err = EINVAL;
2547 else if (((ip->i_mode & IFMT) != IFDIR) &&
2548 ((ip->i_mode & IFMT) != IFATTRDIR))
2549 err = ENOTDIR;
2550 else if ((ip->i_nlink > 2) ||
2551 !ufs_dirempty(ip, dp->i_number, cr)) {
2552 err = EEXIST; /* SIGH should be ENOTEMPTY */
2555 if (err) {
2556 rw_exit(&ip->i_contents);
2557 if (mode == IFDIR || mode == IFATTRDIR)
2558 rw_exit(&ip->i_rwlock);
2559 goto out;
2561 } else if (op == DR_REMOVE) {
2563 * unlink(2) requires a different check: allow only
2564 * privileged users to unlink a directory.
2566 if (vp->v_type == VDIR &&
2567 secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
2568 err = EPERM;
2569 rw_exit(&ip->i_contents);
2570 rw_exit(&ip->i_rwlock);
2571 goto out;
2575 rw_exit(&ip->i_contents);
2578 * Remove the cache'd entry, if any.
2580 dvp = ITOV(dp);
2581 dnlc_remove(dvp, namep);
2582 ep = slot.ep;
2583 ep->d_ino = 0;
2585 if (slot.cached) {
2586 dcanchor_t *dcap = &dp->i_danchor;
2588 (void) dnlc_dir_rem_entry(dcap, namep, NULL);
2589 if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
2590 (void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
2592 if (slot.offset & (DIRBLKSIZ - 1)) {
2594 * Collapse new free space into previous entry.
2595 * Note, the previous entry has already been
2596 * validated in ufs_dircheckforname().
2598 ASSERT(slot.size);
2599 pep = (struct direct *)((char *)ep - slot.size);
2600 if ((pep->d_ino == 0) &&
2601 ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
2602 dnlc_dir_purge(dcap);
2603 slot.cached = 0;
2604 goto nocache;
2606 if (pep->d_ino) {
2607 extra = pep->d_reclen - DIRSIZ(pep);
2608 } else {
2609 extra = pep->d_reclen;
2611 if (extra >= LDIRSIZ(1)) {
2612 (void) dnlc_dir_rem_space_by_handle(dcap,
2613 (uint64_t)(slot.offset - slot.size));
2615 pep->d_reclen += ep->d_reclen;
2616 (void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
2617 (uint64_t)(slot.offset - slot.size));
2618 /* adjust the previous pointer in the next entry */
2619 nep = (struct direct *)((char *)ep + ep->d_reclen);
2620 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
2622 * Not a new block.
2624 * Check the validity of the entry.
2625 * If it's bad, then throw away the cache and
2626 * continue.
2628 if ((nep->d_reclen == 0) ||
2629 (nep->d_reclen & 0x3) ||
2630 (dnlc_dir_update(dcap, nep->d_name,
2631 INO_OFF_TO_H(nep->d_ino,
2632 slot.offset - slot.size)) == DNOENT)) {
2633 dnlc_dir_purge(dcap);
2634 slot.cached = 0;
2637 } else {
2638 (void) dnlc_dir_add_space(dcap, ep->d_reclen,
2639 (uint64_t)slot.offset);
2641 } else {
2643 * If the entry isn't the first in the directory, we must
2644 * reclaim the space of the now empty record by adding
2645 * the record size to the size of the previous entry.
2647 if (slot.offset & (DIRBLKSIZ - 1)) {
2649 * Collapse new free space into previous entry.
2651 pep = (struct direct *)((char *)ep - slot.size);
2652 pep->d_reclen += ep->d_reclen;
2655 nocache:
2658 err = TRANS_DIR(dp, slot.offset);
2659 if (err)
2660 fbrelse(slot.fbp, S_OTHER);
2661 else
2662 err = ufs_fbwrite(slot.fbp, dp);
2663 slot.fbp = NULL;
2666 * If we were removing a directory, it is 'gone' now, but we cannot
2667 * unlock it as a thread may be waiting for the lock in ufs_create. If
2668 * we did, it could then create a file in a deleted directory.
2671 if (err) {
2672 if (mode == IFDIR || mode == IFATTRDIR)
2673 rw_exit(&ip->i_rwlock);
2674 goto out;
2677 rw_enter(&ip->i_contents, RW_WRITER);
2679 dp->i_flag |= IUPD|ICHG;
2680 dp->i_seq++;
2681 ip->i_flag |= ICHG;
2682 ip->i_seq++;
2684 TRANS_INODE(dp->i_ufsvfs, dp);
2685 TRANS_INODE(ip->i_ufsvfs, ip);
2687 * Now dispose of the inode.
2689 if (ip->i_nlink > 0) {
2691 * This is not done for IFATTRDIR's because they don't
2692 * have entries in the dnlc and the link counts are
2693 * not incremented when they are created.
2695 if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
2697 * Decrement by 2 because we're trashing the "."
2698 * entry as well as removing the entry in dp.
2699 * Clear the directory entry, but there may be
2700 * other hard links so don't free the inode.
2701 * Decrement the dp linkcount because we're
2702 * trashing the ".." entry.
2704 ip->i_nlink -= 2;
2705 dp->i_nlink--;
2706 ufs_setreclaim(dp);
2708 * XXX need to discard negative cache entries
2709 * for vp. See comment in ufs_delete().
2711 dnlc_remove(vp, ".");
2712 dnlc_remove(vp, "..");
2714 * The return value is ignored here bacause if
2715 * the directory purge fails we don't want to
2716 * stop the delete. If ufs_dirpurgedotdot fails
2717 * the delete will continue with the preexiting
2718 * behavior.
2720 (void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
2721 } else {
2722 ip->i_nlink--;
2724 ufs_setreclaim(ip);
2726 ITIMES_NOLOCK(dp);
2727 ITIMES_NOLOCK(ip);
2729 if (!TRANS_ISTRANS(dp->i_ufsvfs))
2730 ufs_iupdat(dp, I_SYNC);
2731 if (!TRANS_ISTRANS(ip->i_ufsvfs))
2732 ufs_iupdat(ip, I_SYNC);
2734 rw_exit(&ip->i_contents);
2735 if (mode == IFDIR || mode == IFATTRDIR)
2736 rw_exit(&ip->i_rwlock);
2737 out:
2738 if (mode == IFDIR || mode == IFATTRDIR) {
2739 vn_vfsunlock(vp);
2741 out_novfs:
2742 ASSERT(RW_WRITE_HELD(&dp->i_contents));
2744 if (slot.fbp)
2745 fbrelse(slot.fbp, S_OTHER);
2747 rw_exit(&dp->i_contents);
2748 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
2751 * Release (and delete) the inode after we drop vfs_dqrwlock to
2752 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
2754 if (ip)
2755 VN_RELE(vp);
2757 return (err);
2761 * Return buffer with contents of block "offset"
2762 * from the beginning of directory "ip". If "res"
2763 * is non-zero, fill it in with a pointer to the
2764 * remaining space in the directory.
2769 blkatoff(
2770 struct inode *ip,
2771 off_t offset,
2772 char **res,
2773 struct fbuf **fbpp)
2775 struct fs *fs;
2776 struct fbuf *fbp;
2777 daddr_t lbn;
2778 uint_t bsize;
2779 int err;
2781 CPU_STATS_ADD_K(sys, ufsdirblk, 1);
2782 fs = ip->i_fs;
2783 lbn = (daddr_t)lblkno(fs, offset);
2784 bsize = (uint_t)blksize(fs, ip, lbn);
2785 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
2786 bsize, S_READ, &fbp);
2787 if (err) {
2788 *fbpp = NULL;
2789 return (err);
2791 if (res)
2792 *res = fbp->fb_addr + blkoff(fs, offset);
2793 *fbpp = fbp;
2794 return (0);
2798 * Do consistency checking:
2799 * record length must be multiple of 4
2800 * entry must fit in rest of its DIRBLKSIZ block
2801 * record must be large enough to contain entry
2802 * name is not longer than MAXNAMLEN
2803 * name must be as long as advertised, and null terminated
2804 * NOTE: record length must not be zero (should be checked previously).
2805 * This routine is only called if dirchk is true.
2806 * It would be nice to set the FSBAD flag in the super-block when
2807 * this routine fails so that a fsck is forced on next reboot,
2808 * but locking is a problem.
2810 static int
2811 dirmangled(
2812 struct inode *dp,
2813 struct direct *ep,
2814 int entryoffsetinblock,
2815 off_t offset)
2817 int i;
2819 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
2820 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
2821 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
2822 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
2823 dirbad(dp, "mangled entry", offset);
2824 return (1);
2826 return (0);
2829 static void
2830 dirbad(struct inode *ip, char *how, off_t offset)
2832 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
2833 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
/*
 * Validate a directory entry name of advertised length "l".
 *
 * Returns 1 if a NUL byte appears within the first "l" bytes, otherwise
 * returns the byte that follows them, which is 0 only when the name is
 * terminated exactly where advertised.  Any non-zero result means the
 * name is bad.
 */
static int
dirbadname(char *sp, int l)
{
	char *cp = sp;
	int remaining;

	/* check for nulls */
	for (remaining = l; remaining != 0; remaining--) {
		if (*cp == '\0')
			return (1);
		cp++;
	}

	/* check for terminating null */
	return (*cp);
}
2848 * Check if a directory is empty or not.
2850 static int
2851 ufs_dirempty(
2852 struct inode *ip,
2853 ino_t parentino,
2854 struct cred *cr)
2856 return (ufs_dirscan(ip, parentino, cr, 0));
2860 * clear the .. directory entry.
2862 static int
2863 ufs_dirpurgedotdot(
2864 struct inode *ip,
2865 ino_t parentino,
2866 struct cred *cr)
2868 return (ufs_dirscan(ip, parentino, cr, 1));
2872 * Scan the directoy. If clr_dotdot is true clear the ..
2873 * directory else check to see if the directory is empty.
2875 * Using a struct dirtemplate here is not precisely
2876 * what we want, but better than using a struct direct.
2878 * clr_dotdot is used as a flag to tell us if we need
2879 * to clear the dotdot entry
2881 * N.B.: does not handle corrupted directories.
2883 static int
2884 ufs_dirscan(
2885 struct inode *ip,
2886 ino_t parentino,
2887 struct cred *cr,
2888 int clr_dotdot)
2890 offset_t off;
2891 struct dirtemplate dbuf;
2892 struct direct *dp = (struct direct *)&dbuf;
2893 int err, count;
2894 int empty = 1; /* Assume it's empty */
2895 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2897 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2899 ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
2900 for (off = 0; off < ip->i_size; off += dp->d_reclen) {
2901 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
2902 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
2904 * Since we read MINDIRSIZ, residual must
2905 * be 0 unless we're at end of file.
2907 if (err || count != 0 || dp->d_reclen == 0) {
2908 empty = 0;
2909 break;
2911 /* skip empty entries */
2912 if (dp->d_ino == 0)
2913 continue;
2914 /* accept only "." and ".." */
2915 if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
2916 empty = 0;
2917 break;
2920 * At this point d_namlen must be 1 or 2.
2921 * 1 implies ".", 2 implies ".." if second
2922 * char is also "."
2924 if (dp->d_namlen == 1)
2925 continue;
2926 if (dp->d_name[1] == '.' &&
2927 (ino_t)dp->d_ino == parentino) {
2929 * If we're doing a purge we need to check for
2930 * the . and .. entries and clear the d_ino for ..
2932 * if clr_dotdot is set ufs_dirscan does not
2933 * check for an empty directory.
2935 if (clr_dotdot) {
2937 * Have to actually zap the ..
2938 * entry in the directory, as
2939 * otherwise someone might have
2940 * dp as its cwd and try to
2941 * open .., which now points to
2942 * an unallocated inode.
2944 empty = ufs_dirclrdotdot(ip, parentino);
2945 break;
2946 } else {
2947 continue;
2950 empty = 0;
2951 break;
2953 return (empty);
2956 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
2957 uint64_t dircheck_retry_cnt;
2959 * Check if source directory inode is in the path of the target directory.
2960 * Target is supplied locked.
2962 * The source and target inode's should be different upon entry.
2965 ufs_dircheckpath(
2966 ino_t source_ino,
2967 struct inode *target,
2968 struct inode *sdp,
2969 struct cred *cr)
2971 struct fbuf *fbp;
2972 struct dirtemplate *dirp;
2973 struct inode *ip;
2974 struct ufsvfs *ufsvfsp;
2975 struct inode *tip;
2976 ino_t dotdotino;
2977 int err;
2979 ASSERT(target->i_ufsvfs != NULL);
2980 ASSERT(RW_LOCK_HELD(&target->i_rwlock));
2981 ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));
2983 ip = target;
2984 if (ip->i_number == source_ino) {
2985 err = EINVAL;
2986 goto out;
2988 if (ip->i_number == UFSROOTINO) {
2989 err = 0;
2990 goto out;
2993 * Search back through the directory tree, using the ".." entries.
2994 * Fail any attempt to move a directory into an ancestor directory.
2996 fbp = NULL;
2997 for (;;) {
2998 struct vfs *vfs;
3000 err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
3001 if (err)
3002 break;
3003 if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
3004 ip->i_size < sizeof (struct dirtemplate)) {
3005 dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
3006 err = ENOTDIR;
3007 break;
3009 if (dirp->dotdot_namlen != 2 ||
3010 dirp->dotdot_name[0] != '.' ||
3011 dirp->dotdot_name[1] != '.') {
3012 dirbad(ip, "mangled .. entry", (off_t)0);
3013 err = ENOTDIR; /* Sanity check */
3014 break;
3016 dotdotino = (ino_t)dirp->dotdot_ino;
3017 if (dotdotino == source_ino) {
3018 err = EINVAL;
3019 break;
3021 if (dotdotino == UFSROOTINO)
3022 break;
3023 if (fbp) {
3024 fbrelse(fbp, S_OTHER);
3025 fbp = NULL;
3027 vfs = ip->i_vfs;
3028 ufsvfsp = ip->i_ufsvfs;
3030 if (ip != target) {
3031 rw_exit(&ip->i_rwlock);
3032 VN_RELE(ITOV(ip));
3035 * Race to get the inode.
3037 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3038 if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
3039 rw_exit(&ufsvfsp->vfs_dqrwlock);
3040 ip = NULL;
3041 break;
3043 rw_exit(&ufsvfsp->vfs_dqrwlock);
3045 * If the directory of the source inode (also a directory)
3046 * is the same as this next entry up the chain, then
3047 * we know the source directory itself can't be in the
3048 * chain. This also prevents a panic because we already
3049 * have sdp->i_rwlock locked.
3051 if (tip == sdp) {
3052 VN_RELE(ITOV(tip));
3053 ip = NULL;
3054 break;
3056 ip = tip;
3059 * If someone has set the WRITE_WANTED bit in this lock and if
3060 * this happens to be a sdp or tdp of another parallel rename
3061 * which is executing the same code and in similar situation
3062 * we end up in a 4 way deadlock. We need to make sure that
3063 * the WRITE_WANTED bit is not set.
3065 retry_lock:
3066 if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
3068 * If the lock held as WRITER thats fine but if it
3069 * has WRITE_WANTED bit set we might end up in a
3070 * deadlock. If WRITE_WANTED is set we return
3071 * with EAGAIN else we just go back and try.
3073 if (RW_ISWRITER(&ip->i_rwlock) &&
3074 !(RW_WRITE_HELD(&ip->i_rwlock))) {
3075 err = EAGAIN;
3076 if (fbp) {
3077 fbrelse(fbp, S_OTHER);
3079 VN_RELE(ITOV(ip));
3080 return (err);
3081 } else {
3083 * The lock is being write held. We could
3084 * just do a rw_enter here but there is a
3085 * window between the check and now, where
3086 * the status could have changed, so to
3087 * avoid looping we backoff and go back to
3088 * try for the lock.
3090 delay(retry_backoff_delay);
3091 dircheck_retry_cnt++;
3092 goto retry_lock;
3096 if (fbp) {
3097 fbrelse(fbp, S_OTHER);
3099 out:
3100 if (ip) {
3101 if (ip != target) {
3102 rw_exit(&ip->i_rwlock);
3103 VN_RELE(ITOV(ip));
3106 return (err);
3110 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
3112 offset_t off;
3113 struct dirtemplate dbuf;
3114 struct direct *dp = (struct direct *)&dbuf;
3115 int err, count;
3116 int empty = 1; /* Assume it's empty */
3117 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3119 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3121 ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
3122 for (off = 0; off < ip->i_size; off += dp->d_reclen) {
3123 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
3124 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
3126 * Since we read MINDIRSIZ, residual must
3127 * be 0 unless we're at end of file.
3130 if (err || count != 0 || dp->d_reclen == 0) {
3131 empty = 0;
3132 break;
3134 /* skip empty entries */
3135 if (dp->d_ino == 0)
3136 continue;
3138 * At this point d_namlen must be 1 or 2.
3139 * 1 implies ".", 2 implies ".." if second
3140 * char is also "."
3143 if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
3144 (ino_t)dp->d_ino == parentino)
3145 continue;
3147 if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
3148 dp->d_name[1] == '.') {
3149 continue;
3151 empty = 0;
3152 break;
3154 return (empty);
3159 * Allocate and initialize a new shadow inode to contain extended attributes.
3162 ufs_xattrmkdir(
3163 struct inode *tdp,
3164 struct inode **ipp,
3165 int flags,
3166 struct cred *cr)
3168 struct inode *ip;
3169 struct vattr va;
3170 int err;
3171 int retry = 1;
3172 struct ufsvfs *ufsvfsp;
3173 struct ulockfs *ulp;
3174 int issync;
3175 int trans_size;
3176 int dorwlock; /* 0 = not yet taken, */
3177 /* 1 = taken outside the transaction, */
3178 /* 2 = taken inside the transaction */
3181 * Validate permission to create attribute directory
3184 if ((err = ufs_iaccess(tdp, IWRITE, cr, 1)) != 0) {
3185 return (err);
3188 if (vn_is_readonly(ITOV(tdp)))
3189 return (EROFS);
3192 * No need to re-init err after again:, since it's set before
3193 * the next use of it.
3195 again:
3196 dorwlock = 0;
3197 va.va_type = VDIR;
3198 va.va_uid = tdp->i_uid;
3199 va.va_gid = tdp->i_gid;
3201 if ((tdp->i_mode & IFMT) == IFDIR) {
3202 va.va_mode = (o_mode_t)IFATTRDIR;
3203 va.va_mode |= tdp->i_mode & 0777;
3204 } else {
3205 va.va_mode = (o_mode_t)IFATTRDIR|0700;
3206 if (tdp->i_mode & 0040)
3207 va.va_mode |= 0750;
3208 if (tdp->i_mode & 0004)
3209 va.va_mode |= 0705;
3211 va.va_mask = AT_TYPE|AT_MODE;
3213 ufsvfsp = tdp->i_ufsvfs;
3215 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3216 if (err)
3217 return (err);
3220 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3221 * This follows the protocol for read()/write().
3223 if (ITOV(tdp)->v_type != VDIR) {
3224 rw_enter(&tdp->i_rwlock, RW_WRITER);
3225 dorwlock = 1;
3228 if (ulp) {
3229 trans_size = (int)TOP_MKDIR_SIZE(tdp);
3230 TRANS_BEGIN_CSYNC(ufsvfsp, &issync, TOP_MKDIR, trans_size);
3234 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3235 * This follows the protocol established by
3236 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3238 if (dorwlock == 0) {
3239 rw_enter(&tdp->i_rwlock, RW_WRITER);
3240 dorwlock = 2;
3242 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3243 rw_enter(&tdp->i_contents, RW_WRITER);
3246 * Suppress out of inodes messages if we will retry.
3248 if (retry)
3249 tdp->i_flag |= IQUIET;
3250 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
3251 tdp->i_flag &= ~IQUIET;
3253 if (err)
3254 goto fail;
3256 if (flags) {
3259 * Now attach it to src file.
3262 tdp->i_oeftflag = ip->i_number;
3265 ip->i_cflags |= IXATTR;
3266 ITOV(ip)->v_flag |= V_XATTRDIR;
3267 TRANS_INODE(ufsvfsp, tdp);
3268 tdp->i_flag |= ICHG | IUPD;
3269 tdp->i_seq++;
3270 ufs_iupdat(tdp, I_SYNC);
3271 rw_exit(&tdp->i_contents);
3272 rw_exit(&ufsvfsp->vfs_dqrwlock);
3274 rw_enter(&ip->i_rwlock, RW_WRITER);
3275 rw_enter(&ip->i_contents, RW_WRITER);
3276 TRANS_INODE(ufsvfsp, ip);
3277 ip->i_flag |= ICHG| IUPD;
3278 ip->i_seq++;
3279 ufs_iupdat(ip, I_SYNC);
3280 rw_exit(&ip->i_contents);
3281 rw_exit(&ip->i_rwlock);
3282 if (dorwlock == 2)
3283 rw_exit(&tdp->i_rwlock);
3284 if (ulp) {
3285 int terr = 0;
3287 TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_MKDIR, trans_size);
3288 ufs_lockfs_end(ulp);
3289 if (err == 0)
3290 err = terr;
3292 if (dorwlock == 1)
3293 rw_exit(&tdp->i_rwlock);
3294 *ipp = ip;
3295 return (err);
3297 fail:
3298 rw_exit(&tdp->i_contents);
3299 rw_exit(&ufsvfsp->vfs_dqrwlock);
3300 if (dorwlock == 2)
3301 rw_exit(&tdp->i_rwlock);
3302 if (ulp) {
3303 TRANS_END_CSYNC(ufsvfsp, &err, issync, TOP_MKDIR, trans_size);
3304 ufs_lockfs_end(ulp);
3306 if (dorwlock == 1)
3307 rw_exit(&tdp->i_rwlock);
3308 if (ip != NULL)
3309 VN_RELE(ITOV(ip));
3312 * No inodes? See if any are tied up in pending deletions.
3313 * This has to be done outside of any of the above, because
3314 * the draining operation can't be done from inside a transaction.
3316 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3317 ufs_delete_drain_wait(ufsvfsp, 1);
3318 retry = 0;
3319 goto again;
3322 return (err);
3326 * clear the dotdot directory entry.
3327 * Used by ufs_dirscan when clr_dotdot
3328 * flag is set and we're deleting a
3329 * directory.
3331 static int
3332 ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
3334 struct fbuf *fbp;
3335 struct direct *dotp, *dotdotp;
3336 int err = 0;
3338 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
3339 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3340 err = blkatoff(ip, 0, NULL, &fbp);
3341 if (err) {
3342 return (err);
3345 dotp = (struct direct *)fbp->fb_addr;
3346 if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
3347 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
3348 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
3349 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
3350 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {
3352 dotp->d_reclen += dotdotp->d_reclen;
3353 if (parentino == dotdotp->d_ino) {
3354 dotdotp->d_ino = 0;
3355 dotdotp->d_namlen = 0;
3356 dotdotp->d_reclen = 0;
3359 err = TRANS_DIR(ip, 0);
3360 if (err) {
3361 fbrelse(fbp, S_OTHER);
3362 } else {
3363 err = ufs_fbwrite(fbp, ip);
3366 } else {
3367 err = -1;
3369 return (err);