4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
39 * Directory manipulation routines.
41 * When manipulating directories, the i_rwlock provides serialization
42 * since directories cannot be mmapped. The i_contents lock is redundant.
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signal.h>
55 #include <sys/vnode.h>
61 #include <sys/fs/ufs_inode.h>
62 #include <sys/fs/ufs_fs.h>
63 #include <sys/mount.h>
64 #include <sys/fs/ufs_fsdir.h>
65 #include <sys/fs/ufs_trans.h>
66 #include <sys/fs/ufs_panic.h>
67 #include <sys/fs/ufs_quota.h>
68 #include <sys/errno.h>
69 #include <sys/debug.h>
71 #include <sys/sysmacros.h>
72 #include <sys/cmn_err.h>
73 #include <sys/cpuvar.h>
74 #include <sys/unistd.h>
75 #include <sys/policy.h>
78 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
81 #error "DIRBLKSIZ not a power of 2"
87 static struct dirtemplate mastertemplate
= {
89 0, DIRBLKSIZ
- 12, 2, ".."
/*
 * LDIRSIZ(len) is the number of bytes needed for a directory entry whose
 * name is "len" characters long: sizeof (struct direct) less MAXNAMLEN + 1
 * bytes (presumably the d_name array -- confirm against ufs_fsdir.h), plus
 * the name and one extra byte, rounded up to a multiple of 4.
 *
 * MAX_DIR_NAME_LEN(len) is the inverse view: the longest name that fits in
 * "len" bytes once that same fixed part and one extra byte are subtracted.
 *
 * Both macros parenthesize their argument so that an expression argument
 * (e.g. "a << b" or "c ? x : y") is evaluated as a unit.
 */
#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + (((len) + 1 + 3) & ~3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)
98 * The dnlc directory cache allows a 64 bit handle for directory entries.
99 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
100 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
101 * is way beyond what could be cached in memory by the directory
102 * caching routines. So we are quite safe with this limit.
103 * The macros below pack and unpack the handle.
/*
 * Pack and unpack the 64 bit dnlc directory cache handle.  The inode number
 * lives in the low 32 bits and the directory offset in the high 32 bits.
 *
 * INO_OFF_TO_H() narrows "ino" to 32 bits before merging it, so a negative
 * or wider-than-32-bit argument cannot spill into the offset half.
 * H_TO_OFF() shifts the handle as an unsigned 64 bit value.  Every expansion
 * is fully parenthesized so it can be used inside larger expressions.
 */
#define	H_TO_INO(h)	((uint32_t)((h) & UINT_MAX))
#define	H_TO_OFF(h)	((off_t)((uint64_t)(h) >> 32))
#define	INO_OFF_TO_H(ino, off) \
	((uint64_t)(((uint64_t)(off) << 32) | (uint32_t)(ino)))
110 * The average size of a typical on disk directory entry is about 16 bytes
111 * and so defines AV_DIRECT_SHIFT : log2(16)
112 * This define is only used to approximate the number of entries
113 * in a directory. This is needed for dnlc_dir_start() which will immediately
114 * return an error if the value is not within its acceptable range of
115 * number of files in a directory.
117 #define AV_DIRECT_SHIFT 4
119 * If the directory size (from i_size) is greater than the ufs_min_dir_cache
120 * tunable then we request dnlc directory caching.
121 * This has been found to be profitable after 1024 file names.
123 int ufs_min_dir_cache
= 1024 << AV_DIRECT_SHIFT
;
125 /* The time point the dnlc directory caching was disabled */
126 static hrtime_t ufs_dc_disable_at
;
127 /* directory caching disable duration */
128 static hrtime_t ufs_dc_disable_duration
= (hrtime_t
)NANOSEC
* 5;
/*
 * When non-zero, a name that is not found in a directory whose link count
 * is still positive is entered in the dnlc as DNLC_NO_VNODE.
 */
135 int ufs_negative_cache
= 1;
/* NOTE(review): presumably a retry counter for ufs_dirremove(), which is not visible here -- confirm. */
136 uint64_t ufs_dirremove_retry_cnt
;
/*
 * Forward declarations of file-local helper routines.  They are declared
 * without parameter lists, so the compiler does not check argument types
 * or counts at the call sites.
 */
138 static void dirbad();
139 static int ufs_dirrename();
140 static int ufs_diraddentry();
141 static int ufs_dirempty();
142 static int ufs_dirscan();
143 static int ufs_dirclrdotdot();
144 static int ufs_dirfixdotdot();
145 static int ufs_dirpurgedotdot();
146 static int dirprepareentry();
147 static int ufs_dirmakedirect();
148 static int dirbadname();
149 static int dirmangled();
152 * Check accessibility of directory against inquired mode and type.
153 * Execute access is required to search the directory.
154 * Access for write is interpreted as allowing
155 * deletion of files in the directory.
156 * Note, the reader i_contents lock will be acquired in
160 ufs_diraccess(struct inode
*ip
, int mode
, struct cred
*cr
)
162 if (((ip
->i_mode
& IFMT
) != IFDIR
) &&
163 ((ip
->i_mode
& IFMT
) != IFATTRDIR
))
166 return (ufs_iaccess(ip
, mode
, cr
, 1));
170 * Look for a given name in a directory. On successful return, *ipp
171 * will point to the VN_HELD inode.
172 * The caller is responsible for checking accessibility upfront
173 * via ufs_diraccess().
181 int skipdnlc
, /* skip the 1st level dnlc */
182 int skipcaching
) /* force directory caching off */
185 struct fbuf
*fbp
; /* a buffer of directory entries */
186 struct direct
*ep
; /* the current directory entry */
188 struct vnode
*dvp
; /* directory vnode ptr */
191 off_t endsearch
; /* offset to end directory search */
193 off_t start_off
; /* starting offset from middle search */
194 off_t last_offset
; /* last offset */
195 int entryoffsetinblock
; /* offset of ep in addr's buffer */
196 int numdirpasses
; /* strategy for directory search */
197 int namlen
; /* length of name */
203 ino_t ep_ino
; /* entry i number */
205 ushort_t ep_reclen
; /* direct local d_reclen */
207 ASSERT(*namep
!= '\0'); /* All callers ensure *namep is non null */
210 ulp
= &dp
->i_ufsvfs
->vfs_ulockfs
;
213 * Check the directory name lookup cache, first for individual files
214 * then for complete directories.
217 if (!skipdnlc
&& (vp
= dnlc_lookup(dvp
, namep
))) {
218 /* vp is already held from dnlc_lookup */
219 if (vp
== DNLC_NO_VNODE
) {
227 dcap
= &dp
->i_danchor
;
230 * Grab the reader lock on the directory data before checking
231 * the dnlc to avoid a race with ufs_dirremove() & friends.
233 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
234 * avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
235 * possible, retries the operation.
237 indeadlock
= ufs_tryirwlock(ulp
, &dp
->i_rwlock
, RW_READER
);
241 switch (dnlc_dir_lookup(dcap
, namep
, &handle
)) {
243 ep_ino
= (ino_t
)H_TO_INO(handle
);
244 if (dp
->i_number
== ep_ino
) {
245 VN_HOLD(dvp
); /* want ourself, "." */
247 rw_exit(&dp
->i_rwlock
);
250 if (namep
[0] == '.' && namep
[1] == '.' && namep
[2] == 0) {
253 * release the lock on the dir we are searching
254 * to avoid a deadlock when grabbing the
255 * i_contents lock in ufs_iget_alloced().
257 rw_exit(&dp
->i_rwlock
);
258 rw_enter(&dp
->i_ufsvfs
->vfs_dqrwlock
, RW_READER
);
259 err
= ufs_iget_alloced(dp
->i_vfs
, ep_ino
, ipp
, cr
);
260 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
262 * must recheck as we dropped dp->i_rwlock
264 indeadlock
= ufs_tryirwlock(ulp
, &dp
->i_rwlock
,
271 if (!err
&& (dnlc_dir_lookup(dcap
, namep
, &handle2
)
272 == DFOUND
) && (handle
== handle2
)) {
273 dnlc_update(dvp
, namep
, ITOV(*ipp
));
274 rw_exit(&dp
->i_rwlock
);
277 /* check failed, read the actual directory */
283 /* usual case of not "." nor ".." */
284 rw_enter(&dp
->i_ufsvfs
->vfs_dqrwlock
, RW_READER
);
285 err
= ufs_iget_alloced(dp
->i_vfs
, ep_ino
, ipp
, cr
);
286 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
288 rw_exit(&dp
->i_rwlock
);
291 dnlc_update(dvp
, namep
, ITOV(*ipp
));
292 rw_exit(&dp
->i_rwlock
);
295 if (ufs_negative_cache
&& (dp
->i_nlink
> 0)) {
296 dnlc_enter(dvp
, namep
, DNLC_NO_VNODE
);
298 rw_exit(&dp
->i_rwlock
);
311 * Attempt to cache any directories greater than the tunable
312 * ufs_min_dir_cache. If it fails due to memory shortage (DNOMEM),
313 * disable caching for this directory and record the system time.
314 * Any attempt after the disable time has expired will enable
317 if (!skipcaching
&& (dp
->i_size
>= ufs_min_dir_cache
)) {
319 * if the directory caching disable time has expired
320 * enable the caching again.
322 if (dp
->i_cachedir
== CD_DISABLED_NOMEM
&&
323 gethrtime() - ufs_dc_disable_at
> ufs_dc_disable_duration
) {
324 ufs_dc_disable_at
= 0;
325 dp
->i_cachedir
= CD_ENABLED
;
327 if (dp
->i_cachedir
== CD_ENABLED
) {
328 switch (dnlc_dir_start(dcap
, dp
->i_size
>>
331 dp
->i_cachedir
= CD_DISABLED_NOMEM
;
332 ufs_dc_disable_at
= gethrtime();
335 dp
->i_cachedir
= CD_DISABLED_TOOBIG
;
346 * If caching we don't stop when the file has been
347 * found, but need to know later, so clear *ipp now
354 entryoffsetinblock
= 0;
358 * Take care to look at dp->i_diroff only once, as it
359 * may be changing due to other threads/cpus.
361 offset
= dp
->i_diroff
;
362 if (offset
> dp
->i_size
) {
366 entryoffsetinblock
= 0;
371 entryoffsetinblock
= blkoff(dp
->i_fs
, offset
);
372 if (entryoffsetinblock
!= 0) {
373 err
= blkatoff(dp
, offset
, (char **)0, &fbp
);
380 endsearch
= P2ROUNDUP_TYPED(dp
->i_size
, DIRBLKSIZ
, uoff_t
);
381 namlen
= strlen(namep
);
385 while (offset
< endsearch
) {
387 * If offset is on a block boundary,
388 * read the next directory block.
389 * Release previous if it exists.
391 if (blkoff(dp
->i_fs
, offset
) == 0) {
393 fbrelse(fbp
, S_OTHER
);
395 err
= blkatoff(dp
, offset
, (char **)0, &fbp
);
398 entryoffsetinblock
= 0;
402 * If the offset to the next entry is invalid or if the
403 * next entry is a zero length record or if the record
404 * length is invalid, then skip to the next directory
405 * block. Complete validation checks are done if the
406 * record length is invalid.
408 * Full validation checks are slow so they are disabled
409 * by default. Complete checks can be run by patching
410 * "dirchk" to be true.
412 * We have to check the validity of entryoffsetinblock
413 * here because it can be set to i_diroff above.
415 ep
= (struct direct
*)(fbp
->fb_addr
+ entryoffsetinblock
);
416 if ((entryoffsetinblock
& 0x3) || ep
->d_reclen
== 0 ||
417 (dirchk
|| (ep
->d_reclen
& 0x3)) &&
418 dirmangled(dp
, ep
, entryoffsetinblock
, offset
)) {
419 i
= DIRBLKSIZ
- (entryoffsetinblock
& (DIRBLKSIZ
- 1));
421 entryoffsetinblock
+= i
;
423 dnlc_dir_purge(dcap
);
429 ep_reclen
= ep
->d_reclen
;
432 * Add named entries and free space into the directory cache
438 if (ep
->d_ino
== 0) {
440 if (offset
& (DIRBLKSIZ
- 1)) {
441 dnlc_dir_purge(dcap
);
442 dp
->i_cachedir
= CD_DISABLED
;
447 * entries hold the previous offset except the
448 * 1st which holds the offset + 1
450 if (offset
& (DIRBLKSIZ
- 1)) {
455 caching
= (dnlc_dir_add_entry(dcap
, ep
->d_name
,
456 INO_OFF_TO_H(ep
->d_ino
, off2
)) == DOK
);
457 extra
= ep_reclen
- DIRSIZ(ep
);
459 if (caching
&& (extra
>= LDIRSIZ(1))) {
460 caching
= (dnlc_dir_add_space(dcap
, extra
,
461 (uint64_t)offset
) == DOK
);
466 * Check for a name match.
467 * We have the parent inode read locked with i_rwlock.
469 if (ep
->d_ino
&& ep
->d_namlen
== namlen
&&
470 *namep
== *ep
->d_name
&& /* fast chk 1st chr */
471 bcmp(namep
, ep
->d_name
, (int)ep
->d_namlen
) == 0) {
474 * We have to release the fbp early here to avoid
475 * a possible deadlock situation where we have the
476 * fbp and want the directory inode and someone doing
477 * a ufs_direnter_* has the directory inode and wants
478 * the fbp. XXX - is this still needed?
480 ep_ino
= (ino_t
)ep
->d_ino
;
482 fbrelse(fbp
, S_OTHER
);
486 * Atomic update (read lock held)
488 dp
->i_diroff
= offset
;
490 if (namlen
== 2 && namep
[0] == '.' && namep
[1] == '.') {
491 struct timeval32 omtime
;
494 dnlc_dir_purge(dcap
);
499 * if the inumber didn't change
500 * continue with already found inode.
502 if (ep_ino
== chkino
)
506 /* *ipp is nulled at restart */
511 * release the lock on the dir we are searching
512 * to avoid a deadlock when grabbing the
513 * i_contents lock in ufs_iget_alloced().
515 omtime
= dp
->i_mtime
;
516 rw_exit(&dp
->i_rwlock
);
517 rw_enter(&dp
->i_ufsvfs
->vfs_dqrwlock
,
519 err
= ufs_iget_alloced(dp
->i_vfs
, ep_ino
, ipp
,
521 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
522 indeadlock
= ufs_tryirwlock(ulp
, &dp
->i_rwlock
,
532 * Since we released the lock on the directory,
533 * we must check that the same inode is still
534 * the ".." entry for this directory.
537 if (timercmp(&omtime
, &dp
->i_mtime
, !=)) {
539 * Modification time changed on the
540 * directory, we must go check if
541 * the inumber changed for ".."
545 entryoffsetinblock
= 0;
548 * Forget directory caching
551 dnlc_dir_purge(dcap
);
556 } else if (dp
->i_number
== ep_ino
) {
557 VN_HOLD(dvp
); /* want ourself, "." */
560 dnlc_dir_purge(dcap
);
564 rw_enter(&dp
->i_ufsvfs
->vfs_dqrwlock
,
566 err
= ufs_iget_alloced(dp
->i_vfs
, ep_ino
, ipp
,
568 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
574 dnlc_update(dvp
, namep
, ITOV(*ipp
));
576 * If we are not caching then just return the entry
577 * otherwise complete loading up the cache
580 rw_exit(&dp
->i_rwlock
);
583 err
= blkatoff(dp
, offset
, (char **)0, &fbp
);
587 last_offset
= offset
;
589 entryoffsetinblock
+= ep_reclen
;
592 * If we started in the middle of the directory and failed
593 * to find our target, we must check the beginning as well.
595 if (numdirpasses
== 2) {
598 endsearch
= start_off
;
603 * If whole directory caching is on (or was originally on) then
604 * the entry may have been found.
608 if (ufs_negative_cache
&& (dp
->i_nlink
> 0)) {
609 dnlc_enter(dvp
, namep
, DNLC_NO_VNODE
);
613 dnlc_dir_complete(dcap
);
620 * err and *ipp can both be set if we were attempting to
621 * cache the directory, and we found the entry, then later
622 * while trying to complete the directory cache encountered
623 * an error (e.g. reading a directory sector).
630 fbrelse(fbp
, S_OTHER
);
631 rw_exit(&dp
->i_rwlock
);
633 dnlc_dir_purge(dcap
);
638 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
642 struct inode
*tdp
, /* target directory to make entry in */
643 char *namep
, /* name of entry */
644 enum de_op op
, /* entry operation */
645 struct vattr
*vap
, /* attributes if new inode needed */
646 struct inode
**ipp
, /* return entered inode here */
647 struct cred
*cr
, /* user credentials */
648 int flags
) /* no entry exists */
650 struct inode
*tip
; /* inode of (existing) target file */
652 struct ufs_slot slot
; /* slot info to pass around */
653 int namlen
; /* length of name */
654 int err
; /* error number */
655 struct inode
*nip
; /* new inode */
656 int do_rele_nip
= 0; /* release nip */
657 int noentry
= flags
& ~IQUIET
;
658 int quiet
= flags
& IQUIET
; /* Suppress out of inodes message */
662 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
664 if (((tdp
->i_mode
& IFMT
) == IFATTRDIR
) && ((op
== DE_MKDIR
) ||
665 ((vap
->va_type
== VCHR
) || (vap
->va_type
== VBLK
) ||
666 (vap
->va_type
== VDOOR
) || (vap
->va_type
== VSOCK
) ||
667 (vap
->va_type
== VFIFO
))))
670 /* don't allow '/' characters in pathname component */
671 for (s
= namep
, namlen
= 0; *s
; s
++, namlen
++)
677 * Check accessibility of target directory.
679 if (err
= ufs_diraccess(tdp
, IEXEC
, cr
))
683 * If name is "." or ".." then if this is a create look it up
686 if (namep
[0] == '.' &&
687 (namlen
== 1 || (namlen
== 2 && namep
[1] == '.'))) {
689 * ufs_dirlook will acquire the i_rwlock
692 ulp
= &tdp
->i_ufsvfs
->vfs_ulockfs
;
693 rw_exit(&tdp
->i_rwlock
);
694 if (err
= ufs_dirlook(tdp
, namep
, ipp
, cr
, 0, 0)) {
699 * ufs_tryirwlock uses rw_tryenter and checks for
700 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock.
701 * If deadlock possible, retries the operation.
703 indeadlock
= ufs_tryirwlock(ulp
, &tdp
->i_rwlock
,
710 indeadlock
= ufs_tryirwlock(ulp
, &tdp
->i_rwlock
, RW_WRITER
);
719 * If target directory has not been removed, then we can consider
720 * allowing file to be created.
722 if (tdp
->i_nlink
<= 0) {
727 * Search for the entry. Return VN_HELD tip if found.
732 rw_enter(&tdp
->i_ufsvfs
->vfs_dqrwlock
, RW_READER
);
733 rw_enter(&tdp
->i_contents
, RW_WRITER
);
734 err
= ufs_dircheckforname(tdp
, namep
, namlen
, &slot
, &tip
, cr
, noentry
);
743 * The entry does not exist. Check write permission in
744 * directory to see if entry can be created.
746 if (err
= ufs_iaccess(tdp
, IWRITE
, cr
, 0))
749 * Make new inode and directory entry.
751 tdp
->i_flag
|= quiet
;
752 if (err
= ufs_dirmakeinode(tdp
, &nip
, vap
, op
, cr
)) {
757 if (err
= ufs_diraddentry(tdp
, namep
, op
,
758 namlen
, &slot
, nip
, NULL
, cr
)) {
760 * Unmake the inode we just made.
762 rw_enter(&nip
->i_contents
, RW_WRITER
);
763 if (((nip
->i_mode
& IFMT
) == IFDIR
) ||
764 ((nip
->i_mode
& IFMT
) == IFATTRDIR
)) {
769 TRANS_INODE(tdp
->i_ufsvfs
, tdp
);
774 TRANS_INODE(nip
->i_ufsvfs
, nip
);
778 rw_exit(&nip
->i_contents
);
787 fbrelse(slot
.fbp
, S_OTHER
);
789 tdp
->i_flag
&= ~quiet
;
790 rw_exit(&tdp
->i_contents
);
793 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
794 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
796 rw_exit(&tdp
->i_ufsvfs
->vfs_dqrwlock
);
806 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
810 struct inode
*tdp
, /* target directory to make entry in */
811 char *namep
, /* name of entry */
812 enum de_op op
, /* entry operation */
813 struct inode
*sdp
, /* source inode parent if rename */
814 struct inode
*sip
, /* source inode */
815 struct cred
*cr
) /* user credentials */
817 struct inode
*tip
; /* inode of (existing) target file */
819 struct ufs_slot slot
; /* slot info to pass around */
820 int namlen
; /* length of name */
821 int err
; /* error number */
823 /* don't allow '/' characters in pathname component */
824 for (s
= namep
, namlen
= 0; *s
; s
++, namlen
++)
828 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
831 * If name is "." or ".." then if this is a create look it up
832 * and return EEXIST. Rename or link TO "." or ".." is forbidden.
834 if (namep
[0] == '.' &&
835 (namlen
== 1 || (namlen
== 2 && namep
[1] == '.'))) {
836 if (op
== DE_RENAME
) {
837 return (EINVAL
); /* *SIGH* should be ENOTEMPTY */
842 * For link and rename lock the source entry and check the link count
843 * to see if it has been removed while it was unlocked. If not, we
844 * increment the link count and force the inode to disk to make sure
845 * that it is there before any directory entry that points to it.
847 * In the case of a symbolic link, we are dealing with a new inode
848 * which does not yet have any links. We've created it with a link
849 * count of 1, and we don't want to increment it since this will be
852 * We are about to push the inode to disk. We make sure
853 * that the inode's data blocks are flushed first so the
854 * inode and its data blocks are always in sync. This
855 * adds some robustness in the event of a power failure
856 * or panic where sync fails. If we panic before the
857 * inode is updated, then the inode still refers to the
858 * old data blocks (or none for a new file). If we panic
859 * after the inode is updated, then the inode refers to
860 * the new data blocks.
862 * We do this before grabbing the i_contents lock because
863 * ufs_syncip() will want that lock. We could do the data
864 * syncing after the removal checks, but upon return from
865 * the data sync we would have to repeat the removal
868 if (err
= TRANS_SYNCIP(sip
, 0, I_DSYNC
, TOP_FSYNC
)) {
872 rw_enter(&sip
->i_contents
, RW_WRITER
);
873 if (sip
->i_nlink
<= 0) {
874 rw_exit(&sip
->i_contents
);
877 if (sip
->i_nlink
== MAXLINK
) {
878 rw_exit(&sip
->i_contents
);
883 * Sync the indirect blocks associated with the file
884 * for the same reasons as described above. Since this
885 * call wants the i_contents lock held for it we can do
886 * this here with no extra work.
888 if (err
= ufs_sync_indir(sip
)) {
889 rw_exit(&sip
->i_contents
);
893 if (op
!= DE_SYMLINK
)
895 TRANS_INODE(sip
->i_ufsvfs
, sip
);
898 ufs_iupdat(sip
, I_SYNC
);
899 rw_exit(&sip
->i_contents
);
902 * If target directory has not been removed, then we can consider
903 * allowing file to be created.
905 if (tdp
->i_nlink
<= 0) {
911 * Check accessibility of target directory.
913 if (err
= ufs_diraccess(tdp
, IEXEC
, cr
))
917 * Search for the entry. Return VN_HELD tip if found.
922 rw_enter(&tdp
->i_ufsvfs
->vfs_dqrwlock
, RW_READER
);
923 rw_enter(&tdp
->i_contents
, RW_WRITER
);
924 err
= ufs_dircheckforname(tdp
, namep
, namlen
, &slot
, &tip
, cr
, 0);
931 err
= ufs_dirrename(sdp
, sip
, tdp
, namep
,
938 * Can't link to an existing file.
947 * The entry does not exist. Check write permission in
948 * directory to see if entry can be created.
950 if (err
= ufs_iaccess(tdp
, IWRITE
, cr
, 0))
952 err
= ufs_diraddentry(tdp
, namep
, op
, namlen
, &slot
, sip
, sdp
,
958 fbrelse(slot
.fbp
, S_OTHER
);
960 rw_exit(&tdp
->i_contents
);
963 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
964 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
966 rw_exit(&tdp
->i_ufsvfs
->vfs_dqrwlock
);
969 * If we renamed a file over the top of an existing file,
970 * or linked a file to an existing file (or tried to),
971 * then release and delete (or just release) the inode.
979 * Undo bumped link count.
981 if (op
!= DE_SYMLINK
) {
982 rw_enter(&sip
->i_contents
, RW_WRITER
);
985 TRANS_INODE(sip
->i_ufsvfs
, sip
);
989 rw_exit(&sip
->i_contents
);
996 * Check for the existence of a name in a directory (unless noentry
997 * is set), or else of an empty
998 * slot in which an entry may be made. If the requested name is found,
999 * then on return *ipp points at the inode and slotp->offset contains
1000 * its offset in the directory. If the name is not found, then *ipp
1001 * will be NULL and *slotp will contain information about a directory slot in
1002 * which an entry may be made (either an empty slot, or the first position
1003 * past the end of the directory).
1004 * The target directory inode (tdp) is supplied write locked (i_rwlock).
1006 * This may not be used on "." or "..", but aliases of "." are ok.
1009 ufs_dircheckforname(
1010 struct inode
*tdp
, /* inode of directory being checked */
1011 char *namep
, /* name we're checking for */
1012 int namlen
, /* length of name, excluding null */
1013 struct ufs_slot
*slotp
, /* slot structure */
1014 struct inode
**ipp
, /* return inode if we find one */
1016 int noentry
) /* noentry - just look for space */
1019 struct fbuf
*fbp
; /* pointer to directory block */
1020 struct direct
*ep
; /* directory entry */
1021 struct direct
*nep
; /* next directory entry */
1023 vnode_t
*dvp
; /* directory vnode ptr */
1024 off_t dirsize
; /* size of the directory */
1025 off_t offset
; /* offset in the directory */
1026 off_t last_offset
; /* last offset */
1027 off_t enduseful
; /* pointer past last used dir slot */
1028 int entryoffsetinblk
; /* offset of ep in fbp's buffer */
1029 int i
; /* length of mangled entry */
1036 slotstat_t initstat
= slotp
->status
;
1038 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
1039 ASSERT(RW_WRITE_HELD(&tdp
->i_contents
));
1040 ASSERT(*ipp
== NULL
);
1044 * First check if there is a complete cache of the directory.
1048 dcap
= &tdp
->i_danchor
;
1051 * We know from the 1st level dnlc cache that the entry
1052 * doesn't exist, so don't bother searching the directory
1053 * cache, but just look for space (possibly in the directory
1058 stat
= dnlc_dir_lookup(dcap
, namep
, &handle
);
1062 ep_ino
= (ino_t
)H_TO_INO(handle
);
1063 if (tdp
->i_number
== ep_ino
) {
1064 *ipp
= tdp
; /* we want ourself, ie "." */
1067 err
= ufs_iget_alloced(tdp
->i_vfs
, ep_ino
, ipp
, cr
);
1071 offset
= H_TO_OFF(handle
);
1074 /* This is the first entry in the block */
1077 ASSERT((offset
& (DIRBLKSIZ
- 1)) == 0);
1079 err
= blkatoff(tdp
, offset
, (char **)&ep
, &fbp
);
1081 VN_RELE(ITOV(*ipp
));
1086 * Check the validity of the entry.
1087 * If it's bad, then throw away the cache and
1088 * continue without it. The dirmangled() routine
1089 * will then be called upon it.
1091 if ((ep
->d_reclen
== 0) || (ep
->d_reclen
& 0x3)) {
1092 VN_RELE(ITOV(*ipp
));
1094 dnlc_dir_purge(dcap
);
1098 * Remember the returned offset is the offset of the
1099 * preceding record (unless this is the 1st record
1100 * in the DIRBLKSIZ sized block (disk sector)), then it's
1101 * offset + 1. Note, no real offsets are on odd boundaries.
1104 ASSERT((offset
& (DIRBLKSIZ
- 1)) == 0);
1105 slotp
->offset
= offset
;
1109 /* get the next entry */
1110 nep
= (struct direct
*)((char *)ep
+ ep
->d_reclen
);
1112 * Check the validity of this entry as well
1113 * If it's bad, then throw away the cache and
1114 * continue without it. The dirmangled() routine
1115 * will then be called upon it.
1117 if ((nep
->d_reclen
== 0) || (nep
->d_reclen
& 0x3) ||
1118 (nep
->d_ino
!= ep_ino
)) {
1119 VN_RELE(ITOV(*ipp
));
1121 dnlc_dir_purge(dcap
);
1124 slotp
->offset
= offset
+ ep
->d_reclen
;
1125 slotp
->size
= ep
->d_reclen
;
1128 slotp
->status
= EXIST
;
1132 dnlc_update(dvp
, namep
, ITOV(*ipp
));
1136 * The caller gets to set the initial slot status to
1137 * indicate whether it's interested in getting a
1138 * empty slot. For example, the status can be set
1139 * to FOUND when an entry is being deleted.
1141 ASSERT(slotp
->fbp
== NULL
);
1142 if (slotp
->status
== FOUND
) {
1145 switch (dnlc_dir_rem_space_by_len(dcap
, LDIRSIZ(namlen
),
1148 offset
= (off_t
)handle
;
1149 err
= blkatoff(tdp
, offset
, (char **)&ep
, &fbp
);
1151 dnlc_dir_purge(dcap
);
1152 ASSERT(*ipp
== NULL
);
1156 * Check the validity of the entry.
1157 * If it's bad, then throw away the cache and
1158 * continue without it. The dirmangled() routine
1159 * will then be called upon it.
1161 if ((ep
->d_reclen
== 0) || (ep
->d_reclen
& 0x3)) {
1162 dnlc_dir_purge(dcap
);
1166 * Remember the returned offset is the offset of the
1167 * containing record.
1169 slotp
->status
= FOUND
;
1171 slotp
->offset
= offset
;
1173 slotp
->size
= ep
->d_reclen
;
1175 * Set end offset to 0. Truncation is handled
1176 * because the dnlc cache will blow away the
1177 * cached directory when an entry is removed
1178 * that drops the entries left to less than half
1179 * the minimum number (dnlc_min_dir_cache).
1185 slotp
->status
= NONE
;
1186 slotp
->offset
= P2ROUNDUP_TYPED(tdp
->i_size
,
1188 slotp
->size
= DIRBLKSIZ
;
1199 if (!noentry
&& tdp
->i_size
>= ufs_min_dir_cache
) {
1201 * if the directory caching disable time has expired
1202 * enable caching again.
1204 if (tdp
->i_cachedir
== CD_DISABLED_NOMEM
&&
1205 gethrtime() - ufs_dc_disable_at
> ufs_dc_disable_duration
) {
1206 ufs_dc_disable_at
= 0;
1207 tdp
->i_cachedir
= CD_ENABLED
;
1210 * Attempt to cache any directories greater than the tunable
1211 * ufs_min_dir_cache. If it fails due to memory shortage
1212 * (DNOMEM), disable caching for this directory and record
1213 * the system time. Any attempt after the disable time has
1214 * expired will enable the caching again.
1216 if (tdp
->i_cachedir
== CD_ENABLED
) {
1217 switch (dnlc_dir_start(dcap
,
1218 tdp
->i_size
>> AV_DIRECT_SHIFT
)) {
1220 tdp
->i_cachedir
= CD_DISABLED_NOMEM
;
1221 ufs_dc_disable_at
= gethrtime();
1224 tdp
->i_cachedir
= CD_DISABLED_TOOBIG
;
1236 * No point in using i_diroff since we must search whole directory
1238 dirsize
= P2ROUNDUP_TYPED(tdp
->i_size
, DIRBLKSIZ
, uoff_t
);
1240 offset
= last_offset
= 0;
1241 entryoffsetinblk
= 0;
1242 needed
= (int)LDIRSIZ(namlen
);
1243 while (offset
< dirsize
) {
1245 * If offset is on a block boundary,
1246 * read the next directory block.
1247 * Release previous if it exists.
1249 if (blkoff(tdp
->i_fs
, offset
) == 0) {
1251 fbrelse(fbp
, S_OTHER
);
1253 err
= blkatoff(tdp
, offset
, (char **)0, &fbp
);
1255 ASSERT(*ipp
== NULL
);
1257 dnlc_dir_purge(dcap
);
1261 entryoffsetinblk
= 0;
1264 * If still looking for a slot, and at a DIRBLKSIZ
1265 * boundary, have to start looking for free space
1268 if (slotp
->status
== NONE
&&
1269 (entryoffsetinblk
& (DIRBLKSIZ
- 1)) == 0) {
1273 * If the next entry is a zero length record or if the
1274 * record length is invalid, then skip to the next
1275 * directory block. Complete validation checks are
1276 * done if the record length is invalid.
1278 * Full validation checks are slow so they are disabled
1279 * by default. Complete checks can be run by patching
1280 * "dirchk" to be true.
1282 * We do not have to check the validity of
1283 * entryoffsetinblk here because it starts out as zero
1284 * and is only incremented by d_reclen values that we
1287 ep
= (struct direct
*)(fbp
->fb_addr
+ entryoffsetinblk
);
1288 if (ep
->d_reclen
== 0 ||
1289 (dirchk
|| (ep
->d_reclen
& 0x3)) &&
1290 dirmangled(tdp
, ep
, entryoffsetinblk
, offset
)) {
1291 i
= DIRBLKSIZ
- (entryoffsetinblk
& (DIRBLKSIZ
- 1));
1293 entryoffsetinblk
+= i
;
1295 dnlc_dir_purge(dcap
);
1302 * Add named entries and free space into the directory cache
1308 if (ep
->d_ino
== 0) {
1309 extra
= ep
->d_reclen
;
1310 if (offset
& (DIRBLKSIZ
- 1)) {
1311 dnlc_dir_purge(dcap
);
1316 * entries hold the previous offset if
1319 if (offset
& (DIRBLKSIZ
- 1)) {
1324 caching
= (dnlc_dir_add_entry(dcap
, ep
->d_name
,
1325 INO_OFF_TO_H(ep
->d_ino
, off2
)) == DOK
);
1326 extra
= ep
->d_reclen
- DIRSIZ(ep
);
1328 if (caching
&& (extra
>= LDIRSIZ(1))) {
1329 caching
= (dnlc_dir_add_space(dcap
, extra
,
1330 (uint64_t)offset
) == DOK
);
1335 * If an appropriate sized slot has not yet been found,
1336 * check to see if one is available.
1338 if ((slotp
->status
!= FOUND
) && (slotp
->status
!= EXIST
)) {
1339 int size
= ep
->d_reclen
;
1344 if (size
>= needed
) {
1345 slotp
->offset
= offset
;
1346 slotp
->size
= ep
->d_reclen
;
1350 slotp
->status
= FOUND
;
1354 slotp
->status
= FOUND
;
1355 } else if (slotp
->status
== NONE
) {
1356 if (slotp
->offset
== -1)
1357 slotp
->offset
= offset
;
1362 * Check for a name match.
1364 if (ep
->d_ino
&& ep
->d_namlen
== namlen
&&
1365 *namep
== *ep
->d_name
&& /* fast chk 1st char */
1366 bcmp(namep
, ep
->d_name
, namlen
) == 0) {
1368 tdp
->i_diroff
= offset
;
1370 if (tdp
->i_number
== ep
->d_ino
) {
1371 *ipp
= tdp
; /* we want ourself, ie "." */
1374 err
= ufs_iget_alloced(tdp
->i_vfs
,
1375 (ino_t
)ep
->d_ino
, ipp
, cr
);
1377 fbrelse(fbp
, S_OTHER
);
1379 dnlc_dir_purge(dcap
);
1383 slotp
->status
= EXIST
;
1384 slotp
->offset
= offset
;
1385 slotp
->size
= (int)(offset
- last_offset
);
1390 dnlc_dir_purge(dcap
);
1393 last_offset
= offset
;
1394 offset
+= ep
->d_reclen
;
1395 entryoffsetinblk
+= ep
->d_reclen
;
1400 fbrelse(fbp
, S_OTHER
);
1404 dnlc_dir_complete(dcap
);
1406 if (slotp
->status
== FOUND
) {
1407 if (initstat
== FOUND
) {
1410 (void) dnlc_dir_rem_space_by_handle(dcap
,
1417 if (slotp
->status
== NONE
) {
1419 * We didn't find a slot; the new directory entry should be put
1420 * at the end of the directory. Return an indication of where
1421 * this is, and set "endoff" to zero; since we're going to have
1422 * to extend the directory, we're certainly not going to
1425 slotp
->offset
= dirsize
;
1426 slotp
->size
= DIRBLKSIZ
;
1430 * We found a slot, and will return an indication of where that
1431 * slot is, as any new directory entry will be put there.
1432 * Since that slot will become a useful entry, if the last
1433 * useful entry we found was before this one, update the offset
1434 * of the last useful entry.
1436 if (enduseful
< slotp
->offset
+ slotp
->size
)
1437 enduseful
= slotp
->offset
+ slotp
->size
;
1438 slotp
->endoff
= P2ROUNDUP_TYPED(enduseful
, DIRBLKSIZ
, off_t
);
1444 uint64_t ufs_dirrename_retry_cnt
;
1447 * Rename the entry in the directory tdp so that it points to
1448 * sip instead of tip.
1452 struct inode
*sdp
, /* parent directory of source */
1453 struct inode
*sip
, /* source inode */
1454 struct inode
*tdp
, /* parent directory of target */
1455 char *namep
, /* entry we are trying to change */
1456 struct inode
*tip
, /* target inode */
1457 struct ufs_slot
*slotp
, /* slot for entry */
1458 struct cred
*cr
) /* credentials */
1465 ASSERT(sdp
->i_ufsvfs
!= NULL
);
1466 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
1467 ASSERT(RW_WRITE_HELD(&tdp
->i_contents
));
1469 * Short circuit rename of something to itself.
1471 if (sip
->i_number
== tip
->i_number
) {
1472 return (ESAME
); /* special KLUDGE error code */
1476 * We're locking 2 peer level locks, so must use tryenter
1477 * on the 2nd to avoid deadlocks that would occur
1478 * if we renamed a->b and b->a concurrently.
1481 rw_enter(&tip
->i_contents
, RW_WRITER
);
1482 if (!rw_tryenter(&sip
->i_contents
, RW_READER
)) {
1484 * drop tip and wait (sleep) until we stand a chance
1487 rw_exit(&tip
->i_contents
);
1488 rw_enter(&sip
->i_contents
, RW_READER
);
1490 * Reverse the lock grabs in case we have heavy
1491 * contention on the 2nd lock.
1493 if (!rw_tryenter(&tip
->i_contents
, RW_WRITER
)) {
1494 ufs_dirrename_retry_cnt
++;
1495 rw_exit(&sip
->i_contents
);
1501 * Check that everything is on the same filesystem.
1503 if ((ITOV(tip
)->v_vfsp
!= ITOV(tdp
)->v_vfsp
) ||
1504 (ITOV(tip
)->v_vfsp
!= ITOV(sip
)->v_vfsp
)) {
1505 err
= EXDEV
; /* XXX archaic */
1509 * Must have write permission to rewrite target entry.
1510 * Perform additional checks for sticky directories.
1512 if ((err
= ufs_iaccess(tdp
, IWRITE
, cr
, 0)) != 0 ||
1513 (err
= ufs_sticky_remove_access(tdp
, tip
, cr
)) != 0)
1517 * Ensure source and target are compatible (both directories
1518 * or both not directories). If target is a directory it must
1519 * be empty and have no links to it; in addition it must not
1520 * be a mount point, and both the source and target must be
1523 doingdirectory
= (((sip
->i_mode
& IFMT
) == IFDIR
) ||
1524 ((sip
->i_mode
& IFMT
) == IFATTRDIR
));
1525 if (((tip
->i_mode
& IFMT
) == IFDIR
) ||
1526 ((tip
->i_mode
& IFMT
) == IFATTRDIR
)) {
1527 if (!doingdirectory
) {
1532 * vn_vfsrlock will prevent mounts from using the directory
1533 * until we are done.
1535 if (vn_vfsrlock(ITOV(tip
))) {
1539 if (vn_mountedvfs(ITOV(tip
)) != NULL
) {
1540 vn_vfsunlock(ITOV(tip
));
1544 if (!ufs_dirempty(tip
, tdp
->i_number
, cr
) || tip
->i_nlink
> 2) {
1545 vn_vfsunlock(ITOV(tip
));
1546 err
= EEXIST
; /* SIGH should be ENOTEMPTY */
1549 } else if (doingdirectory
) {
1555 * Rewrite the inode pointer for target name entry
1556 * from the target inode (tip) to the source inode (sip).
1557 * This prevents the target entry from disappearing
1558 * during a crash. Mark the directory inode to reflect the changes.
1561 slotp
->ep
->d_ino
= (int32_t)sip
->i_number
;
1562 dnlc_update(tdvp
, namep
, ITOV(sip
));
1564 offset
= slotp
->offset
- slotp
->size
;
1566 offset
= slotp
->offset
+ 1;
1568 if (slotp
->cached
) {
1569 (void) dnlc_dir_update(&tdp
->i_danchor
, namep
,
1570 INO_OFF_TO_H(slotp
->ep
->d_ino
, offset
));
1573 err
= TRANS_DIR(tdp
, slotp
->offset
);
1575 fbrelse(slotp
->fbp
, S_OTHER
);
1577 err
= ufs_fbwrite(slotp
->fbp
, tdp
);
1582 vn_vfsunlock(ITOV(tip
));
1586 TRANS_INODE(tdp
->i_ufsvfs
, tdp
);
1587 tdp
->i_flag
|= IUPD
|ICHG
;
1592 * Decrement the link count of the target inode.
1593 * Fix the ".." entry in sip to point to tdp.
1594 * This is done after the new entry is on the disk.
1597 TRANS_INODE(tip
->i_ufsvfs
, tip
);
1598 tip
->i_flag
|= ICHG
;
1601 if (doingdirectory
) {
1603 * The entry for tip no longer exists so I can unlock the
1606 vn_vfsunlock(ITOV(tip
));
1608 * Decrement target link count once more if it was a directory.
1610 if (--tip
->i_nlink
!= 0) {
1611 err
= ufs_fault(ITOV(tip
),
1612 "ufs_dirrename: target directory link count != 0 (%s)",
1613 tip
->i_fs
->fs_fsmnt
);
1614 rw_exit(&tip
->i_contents
);
1617 TRANS_INODE(tip
->i_ufsvfs
, tip
);
1618 ufs_setreclaim(tip
);
1620 * Renaming a directory with the parent different
1621 * requires that ".." be rewritten. The window is
1622 * still there for ".." to be inconsistent, but this
1623 * is unavoidable, and a lot shorter than when it was
1624 * done in a user process. We decrement the link
1625 * count in the new parent as appropriate to reflect
1626 * the just-removed target. If the parent is the
1627 * same, this is appropriate since the original
1628 * directory is going away. If the new parent is
1629 * different, ufs_dirfixdotdot() will bump the link count
1633 ufs_setreclaim(tdp
);
1634 TRANS_INODE(tdp
->i_ufsvfs
, tdp
);
1635 tdp
->i_flag
|= ICHG
;
1639 rw_exit(&tip
->i_contents
);
1640 rw_exit(&sip
->i_contents
);
1641 err
= ufs_dirfixdotdot(sip
, sdp
, tdp
);
1645 ufs_setreclaim(tip
);
1647 rw_exit(&tip
->i_contents
);
1648 rw_exit(&sip
->i_contents
);
1653 * Fix the ".." entry of the child directory so that it points
1654 * to the new parent directory instead of the old one. Routine
1655 * assumes that dp is a directory and that all the inodes are on
1656 * the same file system.
1660 struct inode
*dp
, /* child directory */
1661 struct inode
*opdp
, /* old parent directory */
1662 struct inode
*npdp
) /* new parent directory */
1665 struct dirtemplate
*dirp
;
1669 ASSERT(RW_WRITE_HELD(&npdp
->i_rwlock
));
1670 ASSERT(RW_WRITE_HELD(&npdp
->i_contents
));
1673 * We hold the child directory's i_contents lock before calling
1674 * blkatoff so that we honor correct locking protocol which is
1675 * i_contents lock and then page lock. (blkatoff will call
1676 * ufs_getpage where we want the page lock)
1677 * We hold the child directory's i_rwlock before i_contents (as
1678 * per the locking protocol) since we are modifying the ".." entry
1679 * of the child directory.
1680 * We hold the i_rwlock and i_contents lock until we record
1681 * this directory delta to the log (via ufs_trans_dir) and have
1684 rw_enter(&dp
->i_rwlock
, RW_WRITER
);
1685 rw_enter(&dp
->i_contents
, RW_WRITER
);
1686 err
= blkatoff(dp
, (off_t
)0, (char **)&dirp
, &fbp
);
1690 if (dp
->i_nlink
<= 0 ||
1691 dp
->i_size
< sizeof (struct dirtemplate
)) {
1696 if (dirp
->dotdot_namlen
!= 2 ||
1697 dirp
->dotdot_name
[0] != '.' ||
1698 dirp
->dotdot_name
[1] != '.') { /* Sanity check. */
1699 dirbad(dp
, "mangled .. entry", (off_t
)0);
1705 * Increment the link count in the new parent inode and force it out.
1707 if (npdp
->i_nlink
== MAXLINK
) {
1712 TRANS_INODE(npdp
->i_ufsvfs
, npdp
);
1713 npdp
->i_flag
|= ICHG
;
1715 ufs_iupdat(npdp
, I_SYNC
);
1718 * Rewrite the child ".." entry and force it out.
1721 dirp
->dotdot_ino
= (uint32_t)npdp
->i_number
;
1722 dnlc_update(dvp
, "..", ITOV(npdp
));
1723 (void) dnlc_dir_update(&dp
->i_danchor
, "..",
1724 INO_OFF_TO_H(dirp
->dotdot_ino
, 0));
1726 err
= TRANS_DIR(dp
, 0);
1728 fbrelse(fbp
, S_OTHER
);
1730 err
= ufs_fbwrite(fbp
, dp
);
1736 rw_exit(&dp
->i_contents
);
1737 rw_exit(&dp
->i_rwlock
);
1740 * Decrement the link count of the old parent inode and force it out.
1743 rw_enter(&opdp
->i_contents
, RW_WRITER
);
1744 ASSERT(opdp
->i_nlink
> 0);
1746 ufs_setreclaim(opdp
);
1747 TRANS_INODE(opdp
->i_ufsvfs
, opdp
);
1748 opdp
->i_flag
|= ICHG
;
1750 ufs_iupdat(opdp
, I_SYNC
);
1751 rw_exit(&opdp
->i_contents
);
1756 fbrelse(fbp
, S_OTHER
);
1757 rw_exit(&dp
->i_contents
);
1758 rw_exit(&dp
->i_rwlock
);
1763 * Enter the file sip in the directory tdp with name namep.
1771 struct ufs_slot
*slotp
,
1776 struct direct
*ep
, *nep
;
1778 dcanchor_t
*dcap
= &tdp
->i_danchor
;
1783 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
1784 ASSERT(RW_WRITE_HELD(&tdp
->i_contents
));
1786 * Prepare a new entry. If the caller has not supplied an
1787 * existing inode, make a new one.
1789 err
= dirprepareentry(tdp
, slotp
, cr
);
1792 fbrelse(slotp
->fbp
, S_OTHER
);
1798 * Check inode to be linked to see if it is in the
1801 if (ITOV(tdp
)->v_vfsp
!= ITOV(sip
)->v_vfsp
) {
1807 * If renaming a directory then fix up the ".." entry in the
1808 * directory to point to the new parent.
1810 if ((op
== DE_RENAME
) && (((sip
->i_mode
& IFMT
) == IFDIR
) ||
1811 ((sip
->i_mode
& IFMT
) == IFATTRDIR
)) && (sdp
!= tdp
)) {
1812 err
= ufs_dirfixdotdot(sip
, sdp
, tdp
);
1818 * Fill in entry data.
1821 ep
->d_namlen
= (ushort_t
)namlen
;
1822 (void) strncpy(ep
->d_name
, namep
, (size_t)((namlen
+ 4) & ~3));
1823 ep
->d_ino
= (uint32_t)sip
->i_number
;
1825 dnlc_update(tdvp
, namep
, ITOV(sip
));
1827 * Note the offset supplied for any named entry is
1828 * the offset of the previous one, unless it's the 1st.
1829 * slotp->size is used to pass the length to
1830 * the previous entry.
1833 offset
= slotp
->offset
- slotp
->size
;
1835 offset
= slotp
->offset
+ 1;
1838 if (slotp
->cached
) {
1840 * Add back any usable unused space to the dnlc directory
1843 extra
= ep
->d_reclen
- DIRSIZ(ep
);
1844 if (extra
>= LDIRSIZ(1)) {
1845 (void) dnlc_dir_add_space(dcap
, extra
,
1846 (uint64_t)slotp
->offset
);
1849 (void) dnlc_dir_add_entry(dcap
, namep
,
1850 INO_OFF_TO_H(ep
->d_ino
, offset
));
1852 /* adjust the previous offset of the next entry */
1853 nep
= (struct direct
*)((char *)ep
+ ep
->d_reclen
);
1854 if ((uintptr_t)nep
& (DIRBLKSIZ
- 1)) {
1858 * Check the validity of the next entry.
1859 * If it's bad, then throw away the cache, and
1860 * continue as before directory caching.
1862 if ((nep
->d_reclen
== 0) || (nep
->d_reclen
& 0x3) ||
1863 dnlc_dir_update(dcap
, nep
->d_name
,
1864 INO_OFF_TO_H(nep
->d_ino
, slotp
->offset
))
1866 dnlc_dir_purge(dcap
);
1873 * Write out the directory block.
1875 err
= TRANS_DIR(tdp
, slotp
->offset
);
1877 fbrelse(slotp
->fbp
, S_OTHER
);
1879 err
= ufs_fbwrite(slotp
->fbp
, tdp
);
1883 * If this is a rename of a directory, then we have already
1884 * fixed the ".." entry to refer to the new parent. If err
1885 * is true at this point, we have failed to update the new
1886 * parent to refer to the renamed directory.
1887 * XXX - we need to unwind the ".." fix.
1893 * Mark the directory inode to reflect the changes.
1894 * Truncate the directory to chop off blocks of empty entries.
1897 TRANS_INODE(tdp
->i_ufsvfs
, tdp
);
1898 tdp
->i_flag
|= IUPD
|ICHG
;
1903 * If the directory grew then dirprepareentry() will have
1904 * set IATTCHG in tdp->i_flag, then the directory inode must
1905 * be flushed out. This is because if fsync() is used later
1906 * the directory size must be correct, otherwise a crash would
1907 * cause fsck to move the file to lost+found. Also because later
1908 * a file may be linked in more than one directory, then there
1909 * is no way to flush the original directory. So it must be
1910 * flushed out on creation. See bug 4293809.
1912 if (tdp
->i_flag
& IATTCHG
) {
1913 ufs_iupdat(tdp
, I_SYNC
);
1916 if (slotp
->endoff
&& (slotp
->endoff
< tdp
->i_size
)) {
1917 if (!TRANS_ISTRANS(tdp
->i_ufsvfs
)) {
1918 (void) ufs_itrunc(tdp
, (uoff_t
)slotp
->endoff
, 0,
1927 if (slotp
->cached
) {
1928 dnlc_dir_purge(dcap
);
1929 fbrelse(slotp
->fbp
, S_OTHER
);
1936 * Clear out entry prepared by dirprepareentry().
1938 slotp
->ep
->d_ino
= 0;
1939 slotp
->ep
->d_namlen
= 0;
1942 * Don't touch err so we don't clobber the real error that got us here.
1944 if (TRANS_DIR(tdp
, slotp
->offset
))
1945 fbrelse(slotp
->fbp
, S_OTHER
);
1947 (void) ufs_fbwrite(slotp
->fbp
, tdp
);
1953 * Prepare a directory slot to receive an entry.
1957 struct inode
*dp
, /* directory we are working in */
1958 struct ufs_slot
*slotp
, /* available slot info */
1961 struct direct
*ep
, *nep
;
1964 slotstat_t status
= slotp
->status
;
1967 ASSERT((status
== NONE
) || (status
== FOUND
));
1968 ASSERT(RW_WRITE_HELD(&dp
->i_rwlock
));
1969 ASSERT(RW_WRITE_HELD(&dp
->i_contents
));
1971 * If we didn't find a slot, then indicate that the
1972 * new slot belongs at the end of the directory.
1973 * If we found a slot, then the new entry can be
1974 * put at slotp->offset.
1976 entryend
= slotp
->offset
+ slotp
->size
;
1977 if (status
== NONE
) {
1978 ASSERT((slotp
->offset
& (DIRBLKSIZ
- 1)) == 0);
1979 if (DIRBLKSIZ
> dp
->i_fs
->fs_fsize
) {
1980 err
= ufs_fault(ITOV(dp
),
1981 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
1982 " > dp->i_fs->fs_fsize: %d (%s)",
1983 DIRBLKSIZ
, dp
->i_fs
->fs_fsize
, dp
->i_fs
->fs_fsmnt
);
1987 * Allocate the new block.
1989 err
= BMAPALLOC(dp
, (uoff_t
)slotp
->offset
,
1990 (int)(blkoff(dp
->i_fs
, slotp
->offset
) + DIRBLKSIZ
), cr
);
1994 dp
->i_size
= entryend
;
1995 TRANS_INODE(dp
->i_ufsvfs
, dp
);
1996 dp
->i_flag
|= IUPD
|ICHG
|IATTCHG
;
1999 } else if (entryend
> dp
->i_size
) {
2001 * Adjust directory size, if needed. This should never
2002 * push the size past a new multiple of DIRBLKSIZ.
2003 * This is an artifact of the old (4.2BSD) way of initializing
2004 * directory sizes to be less than DIRBLKSIZ.
2006 dp
->i_size
= P2ROUNDUP_TYPED(entryend
, DIRBLKSIZ
, off_t
);
2007 TRANS_INODE(dp
->i_ufsvfs
, dp
);
2008 dp
->i_flag
|= IUPD
|ICHG
|IATTCHG
;
2014 * Get the block containing the space for the new directory entry.
2016 if (slotp
->fbp
== NULL
) {
2017 err
= blkatoff(dp
, slotp
->offset
, (char **)&slotp
->ep
,
2028 * No space in the directory. slotp->offset will be on a
2029 * directory block boundary and we will write the new entry
2030 * into a fresh block.
2032 ep
->d_reclen
= DIRBLKSIZ
;
2033 slotp
->size
= 0; /* length of previous entry */
2037 * An entry of the required size has been found. Use it.
2039 if (ep
->d_ino
== 0) {
2040 /* this is the 1st record in a block */
2041 slotp
->size
= 0; /* length of previous entry */
2044 nep
= (struct direct
*)((char *)ep
+ dsize
);
2045 nep
->d_reclen
= ep
->d_reclen
- dsize
;
2046 ep
->d_reclen
= dsize
;
2048 slotp
->offset
+= dsize
;
2049 slotp
->size
= dsize
; /* length of previous entry */
2059 * Allocate and initialize a new inode that will go into directory tdp.
2060 * This routine is called from ufs_symlink(), as well as within this file.
2072 int imode
; /* mode and format as in inode */
2077 ASSERT(vap
!= NULL
);
2078 ASSERT(op
== DE_CREATE
|| op
== DE_MKDIR
|| op
== DE_ATTRDIR
||
2080 ASSERT((vap
->va_mask
& (AT_TYPE
|AT_MODE
)) == (AT_TYPE
|AT_MODE
));
2081 ASSERT(RW_WRITE_HELD(&tdp
->i_rwlock
));
2082 ASSERT(RW_WRITE_HELD(&tdp
->i_contents
));
2084 * Allocate a new inode.
2086 type
= vap
->va_type
;
2088 ipref
= dirpref(tdp
);
2090 ipref
= tdp
->i_number
;
2092 if (op
== DE_ATTRDIR
)
2093 imode
= vap
->va_mode
;
2095 imode
= MAKEIMODE(type
, vap
->va_mode
);
2097 err
= ufs_ialloc(tdp
, ipref
, imode
, &ip
, cr
);
2102 * We don't need to grab vfs_dqrwlock here because it is held
2103 * in ufs_direnter_*() above us.
2105 ASSERT(RW_READ_HELD(&ip
->i_ufsvfs
->vfs_dqrwlock
));
2106 rw_enter(&ip
->i_contents
, RW_WRITER
);
2107 if (ip
->i_dquot
!= NULL
) {
2108 err
= ufs_fault(ITOV(ip
),
2109 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
2110 tdp
->i_fs
->fs_fsmnt
);
2111 rw_exit(&ip
->i_contents
);
2115 ip
->i_mode
= (o_mode_t
)imode
;
2116 if (type
== VBLK
|| type
== VCHR
) {
2117 dev_t d
= vap
->va_rdev
;
2121 * Don't allow a special file to be created with a
2122 * dev_t that cannot be represented by this filesystem
2125 if (!cmpldev(&dev32
, d
)) {
2130 ITOV(ip
)->v_rdev
= ip
->i_rdev
= d
;
2132 if (dev32
& ~((O_MAXMAJ
<< L_BITSMINOR32
) | O_MAXMIN
)) {
2133 ip
->i_ordev
= dev32
; /* can't use old format */
2135 ip
->i_ordev
= cmpdev(d
);
2138 ITOV(ip
)->v_type
= type
;
2139 ufs_reset_vnode(ip
->i_vnode
);
2141 ip
->i_nlink
= 2; /* anticipating a call to dirmakedirect */
2146 if (op
== DE_ATTRDIR
) {
2147 ip
->i_uid
= vap
->va_uid
;
2148 ip
->i_gid
= vap
->va_gid
;
2150 ip
->i_uid
= crgetuid(cr
);
2152 * To determine the group-id of the created file:
2153 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0
2154 * clients are not likely to set the gid), then use it if
2155 * the process is privileged, belongs to the target group,
2156 * or the group is the same as the parent directory.
2157 * 2) If the filesystem was not mounted with the Old-BSD-compatible
2158 * GRPID option, and the directory's set-gid bit is clear,
2159 * then use the process's gid.
2160 * 3) Otherwise, set the group-id to the gid of the parent directory.
2162 if (op
!= DE_ATTRDIR
&& (vap
->va_mask
& AT_GID
) &&
2163 ((vap
->va_gid
== tdp
->i_gid
) || groupmember(vap
->va_gid
, cr
) ||
2164 secpolicy_vnode_create_gid(cr
) == 0)) {
2166 * XXX - is this only the case when a 4.0 NFS client, or a
2167 * client derived from that code, makes a call over the wire?
2169 ip
->i_gid
= vap
->va_gid
;
2171 ip
->i_gid
= (tdp
->i_mode
& ISGID
) ? tdp
->i_gid
: crgetgid(cr
);
2174 * For SunOS 5.0->5.4, the lines below read:
2176 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
2177 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
2179 * where MAXUID was set to 60002. See notes on this in ufs_inode.c
2182 (ulong_t
)ip
->i_uid
> (ulong_t
)USHRT_MAX
? UID_LONG
: ip
->i_uid
;
2184 (ulong_t
)ip
->i_gid
> (ulong_t
)USHRT_MAX
? GID_LONG
: ip
->i_gid
;
2187 * If we're creating a directory, and the parent directory has the
2188 * set-GID bit set, set it on the new directory.
2189 * Otherwise, if the user is neither privileged nor a member of the
2190 * file's new group, clear the file's set-GID bit.
2192 if ((tdp
->i_mode
& ISGID
) && (type
== VDIR
))
2193 ip
->i_mode
|= ISGID
;
2195 if ((ip
->i_mode
& ISGID
) &&
2196 secpolicy_vnode_setids_setgids(cr
, ip
->i_gid
) != 0)
2197 ip
->i_mode
&= ~ISGID
;
2200 if (((vap
->va_mask
& AT_ATIME
) && TIMESPEC_OVERFLOW(&vap
->va_atime
)) ||
2201 ((vap
->va_mask
& AT_MTIME
) && TIMESPEC_OVERFLOW(&vap
->va_mtime
))) {
2207 * Extended attribute directories are not subject to quotas.
2209 if (op
!= DE_ATTRDIR
)
2210 ip
->i_dquot
= getinoquota(ip
);
2214 if (op
== DE_MKDIR
|| op
== DE_ATTRDIR
) {
2215 err
= ufs_dirmakedirect(ip
, tdp
, (op
== DE_MKDIR
) ? 0 : 1, cr
);
2221 * generate the shadow inode and attach it to the new object
2223 ASSERT((tdp
->i_shadow
&& tdp
->i_ufs_acl
) ||
2224 (!tdp
->i_shadow
&& !tdp
->i_ufs_acl
));
2225 if (tdp
->i_shadow
&& tdp
->i_ufs_acl
&&
2226 (((tdp
->i_mode
& IFMT
) == IFDIR
) ||
2227 ((tdp
->i_mode
& IFMT
) == IFATTRDIR
))) {
2228 err
= ufs_si_inherit(ip
, tdp
, ip
->i_mode
, cr
);
2230 if (op
== DE_MKDIR
) {
2232 * clean up parent directory
2234 * tdp->i_contents already locked from
2238 TRANS_INODE(tdp
->i_ufsvfs
, tdp
);
2239 tdp
->i_flag
|= ICHG
;
2241 ufs_iupdat(tdp
, I_SYNC
);
2248 * If the passed in attributes contain atime and/or mtime
2249 * settings, then use them instead of using the current
2250 * high resolution time.
2252 if (vap
->va_mask
& (AT_MTIME
|AT_ATIME
)) {
2253 if (vap
->va_mask
& AT_ATIME
) {
2254 ip
->i_atime
.tv_sec
= vap
->va_atime
.tv_sec
;
2255 ip
->i_atime
.tv_usec
= vap
->va_atime
.tv_nsec
/ 1000;
2256 ip
->i_flag
&= ~IACC
;
2259 if (vap
->va_mask
& AT_MTIME
) {
2260 ip
->i_mtime
.tv_sec
= vap
->va_mtime
.tv_sec
;
2261 ip
->i_mtime
.tv_usec
= vap
->va_mtime
.tv_nsec
/ 1000;
2263 if (now
.tv_sec
> TIME32_MAX
) {
2265 * In 2038, ctime sticks forever..
2267 ip
->i_ctime
.tv_sec
= TIME32_MAX
;
2268 ip
->i_ctime
.tv_usec
= 0;
2270 ip
->i_ctime
.tv_sec
= now
.tv_sec
;
2271 ip
->i_ctime
.tv_usec
= now
.tv_nsec
/ 1000;
2273 ip
->i_flag
&= ~(IUPD
|ICHG
);
2274 ip
->i_flag
|= IMODTIME
;
2276 ip
->i_flag
|= IUPD
|ICHG
;
2279 ip
->i_flag
|= IACC
|IUPD
|ICHG
;
2283 * If this is an attribute, tag it as one.
2285 if ((tdp
->i_mode
& IFMT
) == IFATTRDIR
) {
2286 ip
->i_cflags
|= IXATTR
;
2290 * push inode before its name appears in a directory
2292 TRANS_INODE(ip
->i_ufsvfs
, ip
);
2293 ufs_iupdat(ip
, I_SYNC
);
2294 rw_exit(&ip
->i_contents
);
2298 /* Throw away inode we just allocated. */
2301 TRANS_INODE(ip
->i_ufsvfs
, ip
);
2305 rw_exit(&ip
->i_contents
);
2310 * Write a prototype directory into the empty inode ip, whose parent is dp.
2314 struct inode
*ip
, /* new directory */
2315 struct inode
*dp
, /* parent directory */
2319 struct dirtemplate
*dirp
;
2323 ASSERT(RW_WRITE_HELD(&ip
->i_contents
));
2324 ASSERT(RW_WRITE_HELD(&dp
->i_rwlock
));
2325 ASSERT(RW_WRITE_HELD(&dp
->i_contents
));
2327 * Allocate space for the directory we're creating.
2329 err
= BMAPALLOC(ip
, 0, DIRBLKSIZ
, cr
);
2332 if (DIRBLKSIZ
> dp
->i_fs
->fs_fsize
) {
2333 err
= ufs_fault(ITOV(dp
),
2334 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
2335 DIRBLKSIZ
, dp
->i_fs
->fs_fsize
,
2336 dp
->i_fs
->fs_fsmnt
);
2339 ip
->i_size
= DIRBLKSIZ
;
2340 TRANS_INODE(ip
->i_ufsvfs
, ip
);
2341 ip
->i_flag
|= IUPD
|ICHG
|IATTCHG
;
2345 * Update the dp link count and write out the change.
2346 * This reflects the ".." entry we'll soon write.
2348 if (dp
->i_nlink
== MAXLINK
)
2352 TRANS_INODE(dp
->i_ufsvfs
, dp
);
2355 ufs_iupdat(dp
, I_SYNC
);
2357 * Initialize directory with "."
2358 * and ".." from static template.
2360 * Since the parent directory is locked, we don't have to
2361 * worry about anything changing when we drop the write
2365 err
= fbread(ITOV(ip
), 0, (uint_t
)ip
->i_fs
->fs_fsize
,
2371 dirp
= (struct dirtemplate
*)fbp
->fb_addr
;
2373 * Now initialize the directory we're creating
2374 * with the "." and ".." entries.
2376 *dirp
= mastertemplate
; /* structure assignment */
2377 dirp
->dot_ino
= (uint32_t)ip
->i_number
;
2378 dirp
->dotdot_ino
= (uint32_t)dp
->i_number
;
2380 err
= TRANS_DIR(ip
, 0);
2382 fbrelse(fbp
, S_OTHER
);
2386 err
= ufs_fbwrite(fbp
, ip
);
2396 TRANS_INODE(dp
->i_ufsvfs
, dp
);
2399 ufs_iupdat(dp
, I_SYNC
);
2404 * Delete a directory entry. If oip is nonzero the entry is checked
2405 * to make sure it still reflects oip.
2416 struct direct
*ep
, *pep
, *nep
;
2419 struct ufs_slot slot
;
2425 namlen
= (int)strlen(namep
);
2427 struct fs
*fs
= dp
->i_fs
;
2429 cmn_err(CE_WARN
, "%s: ufs_dirremove: attempted to remove"
2430 " nameless file in directory (directory inode %llu)",
2431 fs
->fs_fsmnt
, (u_longlong_t
)dp
->i_number
);
2432 ASSERT(namlen
!= 0);
2438 * return error when removing . and ..
2440 if (namep
[0] == '.') {
2443 else if (namlen
== 2 && namep
[1] == '.') {
2444 return (EEXIST
); /* SIGH should be ENOTEMPTY */
2448 ASSERT(RW_WRITE_HELD(&dp
->i_rwlock
));
2452 * Check accessibility of directory.
2454 if (err
= ufs_diraccess(dp
, IEXEC
|IWRITE
, cr
))
2459 slot
.status
= FOUND
; /* don't need to look for empty slot */
2460 rw_enter(&dp
->i_ufsvfs
->vfs_dqrwlock
, RW_READER
);
2461 rw_enter(&dp
->i_contents
, RW_WRITER
);
2463 err
= ufs_dircheckforname(dp
, namep
, namlen
, &slot
, &ip
, cr
, 0);
2471 if (oip
&& oip
!= ip
) {
2476 mode
= ip
->i_mode
& IFMT
;
2477 if (mode
== IFDIR
|| mode
== IFATTRDIR
) {
2480 * vn_vfsrlock() prevents races between mount and rmdir.
2482 if (vn_vfsrlock(vp
)) {
2486 if (vn_mountedvfs(vp
) != NULL
&& op
!= DR_RENAME
) {
2491 * If we are removing a directory, get a lock on it.
2492 * Taking a writer lock prevents a parallel ufs_dirlook from
2493 * incorrectly entering a negative cache vnode entry in the dnlc
2494 * If the directory is empty, it will stay empty until
2497 if (!rw_tryenter(&ip
->i_rwlock
, RW_WRITER
)) {
2499 * It is possible that a thread in rename would have
2500 * acquired this rwlock. To prevent a deadlock we
2501 * do a rw_tryenter. If we fail to get the lock
2502 * we drop all the locks we have acquired, wait
2503 * for 2 ticks and reacquire the
2504 * directory's (dp) i_rwlock and try again.
2505 * If we don't drop dp's i_rwlock then we will panic
2506 * with a "Deadlock: cycle in blocking chain"
2507 * since in ufs_dircheckpath we want dp's i_rwlock.
2508 * dp is guaranteed to exist since ufs_dirremove is
2509 * called after a VN_HOLD(dp) has been done.
2511 ufs_dirremove_retry_cnt
++;
2514 fbrelse(slot
.fbp
, S_OTHER
);
2515 rw_exit(&dp
->i_contents
);
2516 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
2517 rw_exit(&dp
->i_rwlock
);
2520 rw_enter(&dp
->i_rwlock
, RW_WRITER
);
2524 rw_enter(&ip
->i_contents
, RW_READER
);
2527 * Now check the restrictions that apply on sticky directories.
2529 if ((err
= ufs_sticky_remove_access(dp
, ip
, cr
)) != 0) {
2530 rw_exit(&ip
->i_contents
);
2531 if (mode
== IFDIR
|| mode
== IFATTRDIR
)
2532 rw_exit(&ip
->i_rwlock
);
2536 if (op
== DR_RMDIR
) {
2538 * For rmdir(2), some special checks are required.
2539 * (a) Don't remove any alias of the parent (e.g. ".").
2540 * (b) Don't remove the current directory.
2541 * (c) Make sure the entry is (still) a directory.
2542 * (d) Make sure the directory is empty.
2545 if (dp
== ip
|| vp
== cdir
)
2547 else if (((ip
->i_mode
& IFMT
) != IFDIR
) &&
2548 ((ip
->i_mode
& IFMT
) != IFATTRDIR
))
2550 else if ((ip
->i_nlink
> 2) ||
2551 !ufs_dirempty(ip
, dp
->i_number
, cr
)) {
2552 err
= EEXIST
; /* SIGH should be ENOTEMPTY */
2556 rw_exit(&ip
->i_contents
);
2557 if (mode
== IFDIR
|| mode
== IFATTRDIR
)
2558 rw_exit(&ip
->i_rwlock
);
2561 } else if (op
== DR_REMOVE
) {
2563 * unlink(2) requires a different check: allow only
2564 * privileged users to unlink a directory.
2566 if (vp
->v_type
== VDIR
&&
2567 secpolicy_fs_linkdir(cr
, vp
->v_vfsp
)) {
2569 rw_exit(&ip
->i_contents
);
2570 rw_exit(&ip
->i_rwlock
);
2575 rw_exit(&ip
->i_contents
);
2578 * Remove the cache'd entry, if any.
2581 dnlc_remove(dvp
, namep
);
2586 dcanchor_t
*dcap
= &dp
->i_danchor
;
2588 (void) dnlc_dir_rem_entry(dcap
, namep
, NULL
);
2589 if (((int)ep
->d_reclen
- (int)DIRSIZ(ep
)) >= LDIRSIZ(1)) {
2590 (void) dnlc_dir_rem_space_by_handle(dcap
, slot
.offset
);
2592 if (slot
.offset
& (DIRBLKSIZ
- 1)) {
2594 * Collapse new free space into previous entry.
2595 * Note, the previous entry has already been
2596 * validated in ufs_dircheckforname().
2599 pep
= (struct direct
*)((char *)ep
- slot
.size
);
2600 if ((pep
->d_ino
== 0) &&
2601 ((uintptr_t)pep
& (DIRBLKSIZ
- 1))) {
2602 dnlc_dir_purge(dcap
);
2607 extra
= pep
->d_reclen
- DIRSIZ(pep
);
2609 extra
= pep
->d_reclen
;
2611 if (extra
>= LDIRSIZ(1)) {
2612 (void) dnlc_dir_rem_space_by_handle(dcap
,
2613 (uint64_t)(slot
.offset
- slot
.size
));
2615 pep
->d_reclen
+= ep
->d_reclen
;
2616 (void) dnlc_dir_add_space(dcap
, extra
+ ep
->d_reclen
,
2617 (uint64_t)(slot
.offset
- slot
.size
));
2618 /* adjust the previous pointer in the next entry */
2619 nep
= (struct direct
*)((char *)ep
+ ep
->d_reclen
);
2620 if ((uintptr_t)nep
& (DIRBLKSIZ
- 1)) {
2624 * Check the validity of the entry.
2625 * If it's bad, then throw away the cache and
2628 if ((nep
->d_reclen
== 0) ||
2629 (nep
->d_reclen
& 0x3) ||
2630 (dnlc_dir_update(dcap
, nep
->d_name
,
2631 INO_OFF_TO_H(nep
->d_ino
,
2632 slot
.offset
- slot
.size
)) == DNOENT
)) {
2633 dnlc_dir_purge(dcap
);
2638 (void) dnlc_dir_add_space(dcap
, ep
->d_reclen
,
2639 (uint64_t)slot
.offset
);
2643 * If the entry isn't the first in the directory, we must
2644 * reclaim the space of the now empty record by adding
2645 * the record size to the size of the previous entry.
2647 if (slot
.offset
& (DIRBLKSIZ
- 1)) {
2649 * Collapse new free space into previous entry.
2651 pep
= (struct direct
*)((char *)ep
- slot
.size
);
2652 pep
->d_reclen
+= ep
->d_reclen
;
2658 err
= TRANS_DIR(dp
, slot
.offset
);
2660 fbrelse(slot
.fbp
, S_OTHER
);
2662 err
= ufs_fbwrite(slot
.fbp
, dp
);
2666 * If we were removing a directory, it is 'gone' now, but we cannot
2667 * unlock it as a thread may be waiting for the lock in ufs_create. If
2668 * we did, it could then create a file in a deleted directory.
2672 if (mode
== IFDIR
|| mode
== IFATTRDIR
)
2673 rw_exit(&ip
->i_rwlock
);
2677 rw_enter(&ip
->i_contents
, RW_WRITER
);
2679 dp
->i_flag
|= IUPD
|ICHG
;
2684 TRANS_INODE(dp
->i_ufsvfs
, dp
);
2685 TRANS_INODE(ip
->i_ufsvfs
, ip
);
2687 * Now dispose of the inode.
2689 if (ip
->i_nlink
> 0) {
2691 * This is not done for IFATTRDIR's because they don't
2692 * have entries in the dnlc and the link counts are
2693 * not incremented when they are created.
2695 if (op
== DR_RMDIR
&& (ip
->i_mode
& IFMT
) == IFDIR
) {
2697 * Decrement by 2 because we're trashing the "."
2698 * entry as well as removing the entry in dp.
2699 * Clear the directory entry, but there may be
2700 * other hard links so don't free the inode.
2701 * Decrement the dp linkcount because we're
2702 * trashing the ".." entry.
2708 * XXX need to discard negative cache entries
2709 * for vp. See comment in ufs_delete().
2711 dnlc_remove(vp
, ".");
2712 dnlc_remove(vp
, "..");
2714 * The return value is ignored here because if
2715 * the directory purge fails we don't want to
2716 * stop the delete. If ufs_dirpurgedotdot fails
2717 * the delete will continue with the preexisting
2720 (void) ufs_dirpurgedotdot(ip
, dp
->i_number
, cr
);
2729 if (!TRANS_ISTRANS(dp
->i_ufsvfs
))
2730 ufs_iupdat(dp
, I_SYNC
);
2731 if (!TRANS_ISTRANS(ip
->i_ufsvfs
))
2732 ufs_iupdat(ip
, I_SYNC
);
2734 rw_exit(&ip
->i_contents
);
2735 if (mode
== IFDIR
|| mode
== IFATTRDIR
)
2736 rw_exit(&ip
->i_rwlock
);
2738 if (mode
== IFDIR
|| mode
== IFATTRDIR
) {
2742 ASSERT(RW_WRITE_HELD(&dp
->i_contents
));
2745 fbrelse(slot
.fbp
, S_OTHER
);
2747 rw_exit(&dp
->i_contents
);
2748 rw_exit(&dp
->i_ufsvfs
->vfs_dqrwlock
);
2751 * Release (and delete) the inode after we drop vfs_dqrwlock to
2752 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
2761 * Return buffer with contents of block "offset"
2762 * from the beginning of directory "ip". If "res"
2763 * is non-zero, fill it in with a pointer to the
2764 * remaining space in the directory.
2781 CPU_STATS_ADD_K(sys
, ufsdirblk
, 1);
2783 lbn
= (daddr_t
)lblkno(fs
, offset
);
2784 bsize
= (uint_t
)blksize(fs
, ip
, lbn
);
2785 err
= fbread(ITOV(ip
), (offset_t
)(offset
& fs
->fs_bmask
),
2786 bsize
, S_READ
, &fbp
);
2792 *res
= fbp
->fb_addr
+ blkoff(fs
, offset
);
2798 * Do consistency checking:
2799 * record length must be multiple of 4
2800 * entry must fit in rest of its DIRBLKSIZ block
2801 * record must be large enough to contain entry
2802 * name is not longer than MAXNAMLEN
2803 * name must be as long as advertised, and null terminated
2804 * NOTE: record length must not be zero (should be checked previously).
2805 * This routine is only called if dirchk is true.
2806 * It would be nice to set the FSBAD flag in the super-block when
2807 * this routine fails so that a fsck is forced on next reboot,
2808 * but locking is a problem.
2814 int entryoffsetinblock
,
2819 i
= DIRBLKSIZ
- (entryoffsetinblock
& (DIRBLKSIZ
- 1));
2820 if ((ep
->d_reclen
& 0x3) != 0 || (int)ep
->d_reclen
> i
||
2821 (uint_t
)ep
->d_reclen
< DIRSIZ(ep
) || ep
->d_namlen
> MAXNAMLEN
||
2822 ep
->d_ino
&& dirbadname(ep
->d_name
, (int)ep
->d_namlen
)) {
2823 dirbad(dp
, "mangled entry", offset
);
2830 dirbad(struct inode
*ip
, char *how
, off_t offset
)
2832 cmn_err(CE_NOTE
, "%s: bad dir ino %d at offset %ld: %s",
2833 ip
->i_fs
->fs_fsmnt
, (int)ip
->i_number
, offset
, how
);
2837 dirbadname(char *sp
, int l
)
2839 while (l
--) { /* check for nulls */
2840 if (*sp
++ == '\0') {
2844 return (*sp
); /* check for terminating null */
2848 * Check if a directory is empty or not.
2856 return (ufs_dirscan(ip
, parentino
, cr
, 0));
2860 * clear the .. directory entry.
2868 return (ufs_dirscan(ip
, parentino
, cr
, 1));
2872 * Scan the directory. If clr_dotdot is true clear the ..
2873 * directory else check to see if the directory is empty.
2875 * Using a struct dirtemplate here is not precisely
2876 * what we want, but better than using a struct direct.
2878 * clr_dotdot is used as a flag to tell us if we need
2879 * to clear the dotdot entry
2881 * N.B.: does not handle corrupted directories.
2891 struct dirtemplate dbuf
;
2892 struct direct
*dp
= (struct direct
*)&dbuf
;
2894 int empty
= 1; /* Assume it's empty */
2895 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
2897 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
2899 ASSERT(ip
->i_size
<= (offset_t
)MAXOFF_T
);
2900 for (off
= 0; off
< ip
->i_size
; off
+= dp
->d_reclen
) {
2901 err
= ufs_rdwri(UIO_READ
, FREAD
, ip
, (caddr_t
)dp
,
2902 (ssize_t
)MINDIRSIZ
, off
, UIO_SYSSPACE
, &count
, cr
);
2904 * Since we read MINDIRSIZ, residual must
2905 * be 0 unless we're at end of file.
2907 if (err
|| count
!= 0 || dp
->d_reclen
== 0) {
2911 /* skip empty entries */
2914 /* accept only "." and ".." */
2915 if (dp
->d_namlen
> 2 || dp
->d_name
[0] != '.') {
2920 * At this point d_namlen must be 1 or 2.
2921 * 1 implies ".", 2 implies ".." if second
2924 if (dp
->d_namlen
== 1)
2926 if (dp
->d_name
[1] == '.' &&
2927 (ino_t
)dp
->d_ino
== parentino
) {
2929 * If we're doing a purge we need to check for
2930 * the . and .. entries and clear the d_ino for ..
2932 * if clr_dotdot is set ufs_dirscan does not
2933 * check for an empty directory.
2937 * Have to actually zap the ..
2938 * entry in the directory, as
2939 * otherwise someone might have
2940 * dp as its cwd and try to
2941 * open .., which now points to
2942 * an unallocated inode.
2944 empty
= ufs_dirclrdotdot(ip
, parentino
);
2956 clock_t retry_backoff_delay
= 1; /* delay before retrying the i_rwlock */
2957 uint64_t dircheck_retry_cnt
;
2959 * Check if source directory inode is in the path of the target directory.
2960 * Target is supplied locked.
2962 * The source and target inodes should be different upon entry.
2967 struct inode
*target
,
2972 struct dirtemplate
*dirp
;
2974 struct ufsvfs
*ufsvfsp
;
2979 ASSERT(target
->i_ufsvfs
!= NULL
);
2980 ASSERT(RW_LOCK_HELD(&target
->i_rwlock
));
2981 ASSERT(RW_LOCK_HELD(&sdp
->i_rwlock
));
2984 if (ip
->i_number
== source_ino
) {
2988 if (ip
->i_number
== UFSROOTINO
) {
2993 * Search back through the directory tree, using the ".." entries.
2994 * Fail any attempt to move a directory into an ancestor directory.
3000 err
= blkatoff(ip
, (off_t
)0, (char **)&dirp
, &fbp
);
3003 if (((ip
->i_mode
& IFMT
) != IFDIR
) || ip
->i_nlink
== 0 ||
3004 ip
->i_size
< sizeof (struct dirtemplate
)) {
3005 dirbad(ip
, "bad size, unlinked or not dir", (off_t
)0);
3009 if (dirp
->dotdot_namlen
!= 2 ||
3010 dirp
->dotdot_name
[0] != '.' ||
3011 dirp
->dotdot_name
[1] != '.') {
3012 dirbad(ip
, "mangled .. entry", (off_t
)0);
3013 err
= ENOTDIR
; /* Sanity check */
3016 dotdotino
= (ino_t
)dirp
->dotdot_ino
;
3017 if (dotdotino
== source_ino
) {
3021 if (dotdotino
== UFSROOTINO
)
3024 fbrelse(fbp
, S_OTHER
);
3028 ufsvfsp
= ip
->i_ufsvfs
;
3031 rw_exit(&ip
->i_rwlock
);
3035 * Race to get the inode.
3037 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
3038 if (err
= ufs_iget_alloced(vfs
, dotdotino
, &tip
, cr
)) {
3039 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
3043 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
3045 * If the directory of the source inode (also a directory)
3046 * is the same as this next entry up the chain, then
3047 * we know the source directory itself can't be in the
3048 * chain. This also prevents a panic because we already
3049 * have sdp->i_rwlock locked.
3059 * If someone has set the WRITE_WANTED bit in this lock and if
3060 * this happens to be a sdp or tdp of another parallel rename
3061 * which is executing the same code and in similar situation
3062 * we end up in a 4 way deadlock. We need to make sure that
3063 * the WRITE_WANTED bit is not set.
3066 if (!rw_tryenter(&ip
->i_rwlock
, RW_READER
)) {
3068 * If the lock is held as WRITER that's fine but if it
3069 * has WRITE_WANTED bit set we might end up in a
3070 * deadlock. If WRITE_WANTED is set we return
3071 * with EAGAIN else we just go back and try.
3073 if (RW_ISWRITER(&ip
->i_rwlock
) &&
3074 !(RW_WRITE_HELD(&ip
->i_rwlock
))) {
3077 fbrelse(fbp
, S_OTHER
);
3083 * The lock is being write held. We could
3084 * just do a rw_enter here but there is a
3085 * window between the check and now, where
3086 * the status could have changed, so to
3087 * avoid looping we backoff and go back to
3090 delay(retry_backoff_delay
);
3091 dircheck_retry_cnt
++;
3097 fbrelse(fbp
, S_OTHER
);
3102 rw_exit(&ip
->i_rwlock
);
3110 ufs_xattrdirempty(struct inode
*ip
, ino_t parentino
, struct cred
*cr
)
3113 struct dirtemplate dbuf
;
3114 struct direct
*dp
= (struct direct
*)&dbuf
;
3116 int empty
= 1; /* Assume it's empty */
3117 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
3119 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
3121 ASSERT(ip
->i_size
<= (offset_t
)MAXOFF_T
);
3122 for (off
= 0; off
< ip
->i_size
; off
+= dp
->d_reclen
) {
3123 err
= ufs_rdwri(UIO_READ
, FREAD
, ip
, (caddr_t
)dp
,
3124 (ssize_t
)MINDIRSIZ
, off
, UIO_SYSSPACE
, &count
, cr
);
3126 * Since we read MINDIRSIZ, residual must
3127 * be 0 unless we're at end of file.
3130 if (err
|| count
!= 0 || dp
->d_reclen
== 0) {
3134 /* skip empty entries */
3138 * At this point d_namlen must be 1 or 2.
3139 * 1 implies ".", 2 implies ".." if second
3143 if (dp
->d_namlen
== 1 && dp
->d_name
[0] == '.' &&
3144 (ino_t
)dp
->d_ino
== parentino
)
3147 if (dp
->d_namlen
== 2 && dp
->d_name
[0] == '.' &&
3148 dp
->d_name
[1] == '.') {
3159 * Allocate and initialize a new shadow inode to contain extended attributes.
3172 struct ufsvfs
*ufsvfsp
;
3173 struct ulockfs
*ulp
;
3176 int dorwlock
; /* 0 = not yet taken, */
3177 /* 1 = taken outside the transaction, */
3178 /* 2 = taken inside the transaction */
3181 * Validate permission to create attribute directory
3184 if ((err
= ufs_iaccess(tdp
, IWRITE
, cr
, 1)) != 0) {
3188 if (vn_is_readonly(ITOV(tdp
)))
3192 * No need to re-init err after again:, since it's set before
3193 * the next use of it.
3198 va
.va_uid
= tdp
->i_uid
;
3199 va
.va_gid
= tdp
->i_gid
;
3201 if ((tdp
->i_mode
& IFMT
) == IFDIR
) {
3202 va
.va_mode
= (o_mode_t
)IFATTRDIR
;
3203 va
.va_mode
|= tdp
->i_mode
& 0777;
3205 va
.va_mode
= (o_mode_t
)IFATTRDIR
|0700;
3206 if (tdp
->i_mode
& 0040)
3208 if (tdp
->i_mode
& 0004)
3211 va
.va_mask
= AT_TYPE
|AT_MODE
;
3213 ufsvfsp
= tdp
->i_ufsvfs
;
3215 err
= ufs_lockfs_begin(ufsvfsp
, &ulp
, ULOCKFS_MKDIR_MASK
);
3220 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
3221 * This follows the protocol for read()/write().
3223 if (ITOV(tdp
)->v_type
!= VDIR
) {
3224 rw_enter(&tdp
->i_rwlock
, RW_WRITER
);
3229 trans_size
= (int)TOP_MKDIR_SIZE(tdp
);
3230 TRANS_BEGIN_CSYNC(ufsvfsp
, &issync
, TOP_MKDIR
, trans_size
);
3234 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
3235 * This follows the protocol established by
3236 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
3238 if (dorwlock
== 0) {
3239 rw_enter(&tdp
->i_rwlock
, RW_WRITER
);
3242 rw_enter(&ufsvfsp
->vfs_dqrwlock
, RW_READER
);
3243 rw_enter(&tdp
->i_contents
, RW_WRITER
);
3246 * Suppress out of inodes messages if we will retry.
3249 tdp
->i_flag
|= IQUIET
;
3250 err
= ufs_dirmakeinode(tdp
, &ip
, &va
, DE_ATTRDIR
, cr
);
3251 tdp
->i_flag
&= ~IQUIET
;
3259 * Now attach it to src file.
3262 tdp
->i_oeftflag
= ip
->i_number
;
3265 ip
->i_cflags
|= IXATTR
;
3266 ITOV(ip
)->v_flag
|= V_XATTRDIR
;
3267 TRANS_INODE(ufsvfsp
, tdp
);
3268 tdp
->i_flag
|= ICHG
| IUPD
;
3270 ufs_iupdat(tdp
, I_SYNC
);
3271 rw_exit(&tdp
->i_contents
);
3272 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
3274 rw_enter(&ip
->i_rwlock
, RW_WRITER
);
3275 rw_enter(&ip
->i_contents
, RW_WRITER
);
3276 TRANS_INODE(ufsvfsp
, ip
);
3277 ip
->i_flag
|= ICHG
| IUPD
;
3279 ufs_iupdat(ip
, I_SYNC
);
3280 rw_exit(&ip
->i_contents
);
3281 rw_exit(&ip
->i_rwlock
);
3283 rw_exit(&tdp
->i_rwlock
);
3287 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_MKDIR
, trans_size
);
3288 ufs_lockfs_end(ulp
);
3293 rw_exit(&tdp
->i_rwlock
);
3298 rw_exit(&tdp
->i_contents
);
3299 rw_exit(&ufsvfsp
->vfs_dqrwlock
);
3301 rw_exit(&tdp
->i_rwlock
);
3303 TRANS_END_CSYNC(ufsvfsp
, &err
, issync
, TOP_MKDIR
, trans_size
);
3304 ufs_lockfs_end(ulp
);
3307 rw_exit(&tdp
->i_rwlock
);
3312 * No inodes? See if any are tied up in pending deletions.
3313 * This has to be done outside of any of the above, because
3314 * the draining operation can't be done from inside a transaction.
3316 if ((err
== ENOSPC
) && retry
&& TRANS_ISTRANS(ufsvfsp
)) {
3317 ufs_delete_drain_wait(ufsvfsp
, 1);
3326 * clear the dotdot directory entry.
3327 * Used by ufs_dirscan when clr_dotdot
3328 * flag is set and we're deleting a
3332 ufs_dirclrdotdot(struct inode
*ip
, ino_t parentino
)
3335 struct direct
*dotp
, *dotdotp
;
3338 ASSERT(RW_WRITE_HELD(&ip
->i_rwlock
));
3339 ASSERT(RW_LOCK_HELD(&ip
->i_contents
));
3340 err
= blkatoff(ip
, 0, NULL
, &fbp
);
3345 dotp
= (struct direct
*)fbp
->fb_addr
;
3346 if ((dotp
->d_namlen
< (MAXNAMLEN
+ 1)) &&
3347 ((DIRBLKSIZ
- DIRSIZ(dotp
)) >= (sizeof (struct dirtemplate
) / 2))) {
3348 dotdotp
= (struct direct
*)((char *)dotp
+ dotp
->d_reclen
);
3349 if ((dotdotp
->d_namlen
< (MAXNAMLEN
+ 1)) &&
3350 ((DIRBLKSIZ
- DIRSIZ(dotp
)) >= dotdotp
->d_reclen
)) {
3352 dotp
->d_reclen
+= dotdotp
->d_reclen
;
3353 if (parentino
== dotdotp
->d_ino
) {
3355 dotdotp
->d_namlen
= 0;
3356 dotdotp
->d_reclen
= 0;
3359 err
= TRANS_DIR(ip
, 0);
3361 fbrelse(fbp
, S_OTHER
);
3363 err
= ufs_fbwrite(fbp
, ip
);