From: Matthew Dillon
Date: Sun, 10 Aug 2008 17:01:09 +0000 (+0000)
Subject: MFC HAMMER 2.0:02 - rmdir, stability
X-Git-Tag: v2.0.1~26
X-Git-Url: https://repo.or.cz/w/dragonfly.git/commitdiff_plain/4c68eed06a99a6c8b07283cf51d8485c104e36cb

MFC HAMMER 2.0:02 - rmdir, stability
---

diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c
index fac61e8cef..cbded3a4f1 100644
--- a/sys/vfs/hammer/hammer_inode.c
+++ b/sys/vfs/hammer/hammer_inode.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103.2.4 2008/08/06 15:41:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103.2.5 2008/08/10 17:01:08 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1606,9 +1606,9 @@ hammer_setup_parent_inodes_helper(hammer_record_t record,
 #endif
         if (pip->flush_group == flg) {
                 /*
-                 * If the parent is in the same flush group as us we can
-                 * just set the record to a flushing state and we are
-                 * done.
+                 * Because we have not calculated nlinks yet we can just
+                 * set records to the flush state if the parent is in
+                 * the same flush group as we are.
                  */
                 record->flush_state = HAMMER_FST_FLUSH;
                 record->flush_group = flg;
@@ -1674,12 +1674,14 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
          * Figure out how many in-memory records we can actually flush
          * (not including inode meta-data, buffers, etc).
          */
+        KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
         if (flags & HAMMER_FLUSH_RECURSION) {
                 /*
                  * If this is a upwards recursion we do not want to
                  * recurse down again!
                  */
                 go_count = 1;
+#if 0
         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
                 /*
                  * No new records are added if we must complete a flush
@@ -1691,6 +1693,7 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
                                    hammer_syncgrp_child_callback, NULL);
 #endif
                 go_count = 1;
+#endif
         } else {
                 /*
                  * Normal flush, scan records and bring them into the flush.
@@ -1750,17 +1753,8 @@ hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
          *       and stays in ip->flags.  Once set, it stays set until the
          *       inode is destroyed.
-         *
-         * NOTE: If a truncation from a previous flush cycle had to be
-         *       continued into this one, the TRUNCATED flag will still be
-         *       set in sync_flags as will WOULDBLOCK.  When this occurs
-         *       we CANNOT safely integrate a new truncation from the front-end
-         *       because there may be data records in-memory assigned a flush
-         *       state from the previous cycle that are supposed to be flushed
-         *       before the next frontend truncation.
          */
-        if ((ip->flags & (HAMMER_INODE_TRUNCATED | HAMMER_INODE_WOULDBLOCK)) ==
-            HAMMER_INODE_TRUNCATED) {
+        if (ip->flags & HAMMER_INODE_TRUNCATED) {
                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
                 ip->sync_trunc_off = ip->trunc_off;
                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
@@ -1960,12 +1954,7 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
                 break;
         case HAMMER_FST_FLUSH:
                 /*
-                 * If the WOULDBLOCK flag is set records may have been left
-                 * over from a previous flush attempt.  The flush group will
-                 * have been left intact - we are probably reflushing it
-                 * now.
-                 *
-                 * If a flush error occured ip->error will be non-zero.
+                 * The flush_group should already match.
                  */
                 KKASSERT(rec->flush_group == flg);
                 r = 1;
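[Editor's note: not part of the commit] The truncation hunk above replaces the two-flag test with a plain HAMMER_INODE_TRUNCATED check, now that the new KKASSERT guarantees WOULDBLOCK is never set on entry. A minimal standalone sketch of the frontend-to-backend truncation handoff it performs follows. toy_inode, INODE_TRUNCATED, and NO_TRUNCATION are illustrative stand-ins, not HAMMER's real definitions, and the flag transfers beyond the two assignments visible in the hunk are assumptions.

#include <stdint.h>
#include <stdio.h>

#define INODE_TRUNCATED 0x0001                  /* pending frontend truncation */
#define NO_TRUNCATION   0x7FFFFFFFFFFFFFFFLL    /* sentinel offset, as in the hunk */

struct toy_inode {
        int     flags;                  /* frontend state */
        int     sync_flags;             /* backend (flusher) state */
        int64_t trunc_off;              /* frontend truncation offset */
        int64_t sync_trunc_off;         /* offset handed to the backend */
};

static void
handoff_truncation(struct toy_inode *ip)
{
        if (ip->flags & INODE_TRUNCATED) {
                /* hand the pending offset to the backend side ... */
                ip->sync_trunc_off = ip->trunc_off;
                ip->sync_flags |= INODE_TRUNCATED;      /* assumed detail */
                /* ... and reset the frontend so new truncations accumulate */
                ip->trunc_off = NO_TRUNCATION;
                ip->flags &= ~INODE_TRUNCATED;          /* assumed detail */
        }
}

int
main(void)
{
        struct toy_inode ip = { INODE_TRUNCATED, 0, 4096, NO_TRUNCATION };

        handoff_truncation(&ip);
        printf("backend truncates at %lld\n", (long long)ip.sync_trunc_off);
        return (0);
}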
@@ -2081,10 +2070,11 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
                 /*
                  * We were unable to flush out all our records, leave the
                  * inode in a flush state and in the current flush group.
+                 * The flush group will be re-run.
                  *
-                 * This occurs if the UNDO block gets too full
-                 * or there is too much dirty meta-data and allows the
-                 * flusher to finalize the UNDO block and then re-flush.
+                 * This occurs if the UNDO block gets too full or there is
+                 * too much dirty meta-data and allows the flusher to
+                 * finalize the UNDO block and then re-flush.
                  */
                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
                 dorel = 0;
@@ -2124,21 +2114,21 @@ hammer_flush_inode_done(hammer_inode_t ip, int error)
                         ip->flags &= ~HAMMER_INODE_FLUSHW;
                         wakeup(&ip->flags);
                 }
-        }
 
-        /*
-         * If the frontend made more changes and requested another flush,
-         * then try to get it running.
-         *
-         * Reflushes are aborted when the inode is errored out.
-         */
-        if (ip->flags & HAMMER_INODE_REFLUSH) {
-                ip->flags &= ~HAMMER_INODE_REFLUSH;
-                if (ip->flags & HAMMER_INODE_RESIGNAL) {
-                        ip->flags &= ~HAMMER_INODE_RESIGNAL;
-                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
-                } else {
-                        hammer_flush_inode(ip, 0);
+                /*
+                 * If the frontend made more changes and requested another
+                 * flush, then try to get it running.
+                 *
+                 * Reflushes are aborted when the inode is errored out.
+                 */
+                if (ip->flags & HAMMER_INODE_REFLUSH) {
+                        ip->flags &= ~HAMMER_INODE_REFLUSH;
+                        if (ip->flags & HAMMER_INODE_RESIGNAL) {
+                                ip->flags &= ~HAMMER_INODE_RESIGNAL;
+                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
+                        } else {
+                                hammer_flush_inode(ip, 0);
+                        }
                 }
         }
 
@@ -2317,7 +2307,7 @@ done:
 }
 
 /*
- * XXX error handling
+ * Backend function called by the flusher to sync an inode to media.
  */
 int
 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
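[Editor's note: not part of the commit] The hunk above moves the REFLUSH re-arm block into the scope it previously followed, keeping the retained rule that reflushes are aborted when the inode is errored out. Below is a standalone reduction of that re-arm pattern, assuming for illustration that the re-arm runs only when the flush completed without error; the flag values and toy_flush() are stand-ins, not HAMMER's definitions.

#include <stdio.h>

#define INODE_REFLUSH   0x0001  /* frontend queued more work mid-flush */
#define INODE_RESIGNAL  0x0002  /* ...and wants the next flush signaled */
#define FLUSH_SIGNAL    0x0001

static void
toy_flush(int how)
{
        printf("flush requested%s\n", (how & FLUSH_SIGNAL) ? " (signaled)" : "");
}

/*
 * Re-arm a deferred flush request once the current flush finishes.
 * On error the request is dropped rather than re-armed.
 */
static void
flush_done(int *flags, int error)
{
        if (error == 0 && (*flags & INODE_REFLUSH)) {
                *flags &= ~INODE_REFLUSH;
                if (*flags & INODE_RESIGNAL) {
                        *flags &= ~INODE_RESIGNAL;
                        toy_flush(FLUSH_SIGNAL);
                } else {
                        toy_flush(0);
                }
        }
}

int
main(void)
{
        int flags = INODE_REFLUSH | INODE_RESIGNAL;

        flush_done(&flags, 0);          /* prints: flush requested (signaled) */
        return (0);
}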
diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c
index e4d306cfee..e84592a660 100644
--- a/sys/vfs/hammer/hammer_object.c
+++ b/sys/vfs/hammer/hammer_object.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.90.2.4 2008/08/06 15:41:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.90.2.5 2008/08/10 17:01:08 dillon Exp $
  */
 
 #include "hammer.h"
@@ -40,6 +40,7 @@ static int hammer_mem_lookup(hammer_cursor_t cursor);
 static int hammer_mem_first(hammer_cursor_t cursor);
 static int hammer_frontend_trunc_callback(hammer_record_t record,
                                 void *data __unused);
+static int hammer_bulk_scan_callback(hammer_record_t record, void *data);
 static int hammer_record_needs_overwrite_delete(hammer_record_t record);
 static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
                                 hammer_btree_leaf_elm_t leaf);
@@ -49,6 +50,11 @@ struct rec_trunc_info {
         int64_t         trunc_off;
 };
 
+struct hammer_bulk_info {
+        hammer_record_t record;
+        struct hammer_btree_leaf_elm leaf;
+};
+
 /*
  * Red-black tree support.  Comparison code for insertion.
  */
@@ -108,52 +114,46 @@ hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
 }
 
 /*
- * Special LOOKUP_INFO to locate an overlapping record.  This used by
- * the reservation code to implement small-block records (whos keys will
- * be different depending on data_len, when representing the same base
- * offset).
+ * Ranged scan to locate overlapping record(s).  This is used by
+ * hammer_ip_get_bulk() to locate an overlapping record.  We have
+ * to use a ranged scan because the keys for data records with the
+ * same file base offset can be different due to differing data_len's.
  *
  * NOTE: The base file offset of a data record is (key - data_len), not (key).
  */
 static int
-hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec)
+hammer_rec_overlap_cmp(hammer_record_t rec, void *data)
 {
-        if (leaf->base.rec_type < rec->leaf.base.rec_type)
+        struct hammer_bulk_info *info = data;
+        hammer_btree_leaf_elm_t leaf = &info->leaf;
+
+        if (rec->leaf.base.rec_type < leaf->base.rec_type)
                 return(-3);
-        if (leaf->base.rec_type > rec->leaf.base.rec_type)
+        if (rec->leaf.base.rec_type > leaf->base.rec_type)
                 return(3);
 
         /*
          * Overlap compare
          */
         if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
-                /* leaf_end <= rec_beg */
-                if (leaf->base.key <= rec->leaf.base.key - rec->leaf.data_len)
-                        return(-2);
-                /* leaf_beg >= rec_end */
-                if (leaf->base.key - leaf->data_len >= rec->leaf.base.key)
+                /* rec_beg >= leaf_end */
+                if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key)
                         return(2);
+                /* rec_end <= leaf_beg */
+                if (rec->leaf.base.key <= leaf->base.key - leaf->data_len)
+                        return(-2);
         } else {
-                if (leaf->base.key < rec->leaf.base.key)
+                if (rec->leaf.base.key < leaf->base.key)
                         return(-2);
-                if (leaf->base.key > rec->leaf.base.key)
+                if (rec->leaf.base.key > leaf->base.key)
                         return(2);
         }
 
         /*
-         * Never match against an item deleted by the front-end.
-         * leaf is less then rec if rec is marked deleted.
-         *
-         * We must still return the proper code for the scan to continue
-         * along the correct branches.
+         * We have to return 0 at this point, even if DELETED_FE is set,
+         * because returning anything else will cause the scan to ignore
+         * one of the branches when we really want it to check both.
          */
-        if (rec->flags & HAMMER_RECF_DELETED_FE) {
-                if (leaf->base.key < rec->leaf.base.key)
-                        return(-2);
-                if (leaf->base.key > rec->leaf.base.key)
-                        return(2);
-                return(-1);
-        }
         return(0);
 }
 
@@ -240,8 +240,6 @@ hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
 }
 
 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
-RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
-                    hammer_rec_overlap_compare, hammer_btree_leaf_elm_t);
 
 /*
  * Allocate a record for the caller to finish filling in.  The record is
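[Editor's note: not part of the commit] The rewritten comparator above encodes the key convention that a HAMMER data record stores its END offset in base.key, so the record covers [key - data_len, key). A sketch of the same overlap test reduced to plain offsets, with the -2/2/0 return codes following the scan convention visible in the hunk:

#include <assert.h>
#include <stdint.h>

static int
overlap_cmp(int64_t rec_end, int64_t rec_len, int64_t key_end, int64_t key_len)
{
        int64_t rec_beg = rec_end - rec_len;
        int64_t key_beg = key_end - key_len;

        if (rec_beg >= key_end)
                return(2);      /* record entirely beyond the range: prune */
        if (rec_end <= key_beg)
                return(-2);     /* record entirely before the range: prune */

        /*
         * Overlap.  Front-end-deleted records are no longer special-cased
         * here; the scan callback filters them instead, so both subtrees
         * are still visited (the behavior the rewrite fixes).
         */
        return(0);
}

int
main(void)
{
        /* record covers [0,4096), query covers [4096,8192): no overlap */
        assert(overlap_cmp(4096, 4096, 8192, 4096) == -2);
        /* record covers [4096,8192), same query: overlap */
        assert(overlap_cmp(8192, 4096, 8192, 4096) == 0);
        return (0);
}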
@@ -827,27 +825,44 @@ hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
  * to queue the BIO to the flusher.  Only the related record gets queued
  * to the flusher.
  */
+
 static hammer_record_t
 hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
 {
-        hammer_record_t record;
-        struct hammer_btree_leaf_elm leaf;
+        struct hammer_bulk_info info;
+
+        bzero(&info, sizeof(info));
+        info.leaf.base.obj_id = ip->obj_id;
+        info.leaf.base.key = file_offset + bytes;
+        info.leaf.base.create_tid = 0;
+        info.leaf.base.delete_tid = 0;
+        info.leaf.base.rec_type = HAMMER_RECTYPE_DATA;
+        info.leaf.base.obj_type = 0;                            /* unused */
+        info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;        /* unused */
+        info.leaf.base.localization = ip->obj_localization +    /* unused */
+                                      HAMMER_LOCALIZE_MISC;
+        info.leaf.data_len = bytes;
 
-        bzero(&leaf, sizeof(leaf));
-        leaf.base.obj_id = ip->obj_id;
-        leaf.base.key = file_offset + bytes;
-        leaf.base.create_tid = 0;
-        leaf.base.delete_tid = 0;
-        leaf.base.rec_type = HAMMER_RECTYPE_DATA;
-        leaf.base.obj_type = 0;                         /* unused */
-        leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;     /* unused */
-        leaf.base.localization = ip->obj_localization + HAMMER_LOCALIZE_MISC;
-        leaf.data_len = bytes;
-
-        record = hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &leaf);
-        if (record)
-                hammer_ref(&record->lock);
-        return(record);
+        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp,
+                                   hammer_bulk_scan_callback, &info);
+
+        return(info.record);    /* may be NULL */
+}
+
+/*
+ * Take records vetted by overlap_cmp.  The first non-deleted record
+ * (if any) stops the scan.
+ */
+static int
+hammer_bulk_scan_callback(hammer_record_t record, void *data)
+{
+        struct hammer_bulk_info *info = data;
+
+        if (record->flags & HAMMER_RECF_DELETED_FE)
+                return(0);
+        hammer_ref(&record->lock);
+        info->record = record;
+        return(-1);                     /* stop scan */
 }
 
 /*
@@ -870,10 +885,10 @@ hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
 
         /*
          * Deal with conflicting in-memory records.  We cannot have multiple
-         * in-memory records for the same offset without seriously confusing
-         * the backend, including but not limited to the backend issuing
-         * delete-create-delete sequences and asserting on the delete_tid
-         * being the same as the create_tid.
+         * in-memory records for the same base offset without seriously
+         * confusing the backend, including but not limited to the backend
+         * issuing delete-create-delete or create-delete-create sequences
+         * and asserting on the delete_tid being the same as the create_tid.
          *
          * If we encounter a record with the backend interlock set we cannot
          * immediately delete it without confusing the backend.
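[Editor's note: not part of the commit] hammer_ip_get_bulk() now uses the comparator + callback scan contract instead of a single LOOKUP_INFO hit: the comparator prunes subtrees with -2/2, and the callback returns 0 to continue past front-end-deleted records or a negative value to stop at the first live one. The sketch below models that contract on a toy binary tree; it assumes the stored ranges are mutually disjoint and ordered by offset (true for file data extents), which is what makes the subtree pruning valid. All names here are illustrative.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_node {
        int64_t beg, end;               /* record covers [beg, end) */
        int deleted;                    /* stand-in for DELETED_FE */
        struct toy_node *left, *right;
};

struct toy_info {                       /* mirrors hammer_bulk_info */
        struct toy_node *record;
        int64_t key_beg, key_end;
};

/* -2/2: entirely outside the range (prune one side); 0: overlap */
static int
toy_cmp(struct toy_node *n, int64_t key_beg, int64_t key_end)
{
        if (n->beg >= key_end)
                return(2);
        if (n->end <= key_beg)
                return(-2);
        return(0);
}

/* First non-deleted overlap stops the scan, as in hammer_bulk_scan_callback() */
static int
toy_callback(struct toy_node *n, void *data)
{
        struct toy_info *info = data;

        if (n->deleted)
                return(0);
        info->record = n;
        return(-1);
}

static int
toy_scan(struct toy_node *n, struct toy_info *info,
         int (*callback)(struct toy_node *, void *))
{
        int r;

        if (n == NULL)
                return(0);
        r = toy_cmp(n, info->key_beg, info->key_end);
        /* with disjoint ordered ranges, r == -2 means nothing left can match */
        if (r != -2 && toy_scan(n->left, info, callback) < 0)
                return(-1);
        if (r == 0 && callback(n, info) < 0)
                return(-1);
        /* likewise r == 2 means nothing right of n can match */
        if (r != 2 && toy_scan(n->right, info, callback) < 0)
                return(-1);
        return(0);
}

int
main(void)
{
        struct toy_node a = { 0, 4096, 1, NULL, NULL };      /* FE-deleted */
        struct toy_node b = { 4096, 8192, 0, &a, NULL };
        struct toy_info info = { NULL, 0, 8192 };

        toy_scan(&b, &info, toy_callback);      /* skips a, stops at b */
        printf("found [%lld,%lld)\n",
               (long long)info.record->beg, (long long)info.record->end);
        return (0);
}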
diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c
index e98d5bb747..2ff9f11be5 100644
--- a/sys/vfs/hammer/hammer_vnops.c
+++ b/sys/vfs/hammer/hammer_vnops.c
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.91.2.4 2008/08/06 15:41:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.91.2.5 2008/08/10 17:01:09 dillon Exp $
  */
 
 #include 
@@ -156,7 +156,8 @@ struct hammer_inode *HammerTruncIp;
 #endif
 
 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
-                        struct vnode *dvp, struct ucred *cred, int flags);
+                        struct vnode *dvp, struct ucred *cred,
+                        int flags, int isdir);
 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
 
@@ -1491,7 +1492,7 @@ hammer_vop_nremove(struct vop_nremove_args *ap)
 
         hammer_start_transaction(&trans, dip->hmp);
         ++hammer_stats_file_iopsw;
-        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
+        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
         hammer_done_transaction(&trans);
 
         return (error);
@@ -1550,7 +1551,8 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
          * Force the inode sync-time to match the transaction so it is
          * in-sync with the creation of the target directory entry.
          */
-        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
+        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
+                                ap->a_cred, 0, -1);
         if (error == 0 || error == ENOENT) {
                 error = hammer_ip_add_directory(&trans, tdip,
                                                 tncp->nc_name, tncp->nc_nlen,
@@ -1652,8 +1654,6 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
         int error;
 
         dip = VTOI(ap->a_dvp);
-        if (dip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY)
-                return(ENOTDIR);
 
         if (hammer_nohistory(dip) == 0 &&
             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
@@ -1662,7 +1662,7 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
 
         hammer_start_transaction(&trans, dip->hmp);
         ++hammer_stats_file_iopsw;
-        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
+        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
         hammer_done_transaction(&trans);
 
         return (error);
@@ -1994,7 +1994,7 @@ hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
         hammer_start_transaction(&trans, dip->hmp);
         ++hammer_stats_file_iopsw;
         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
-                                ap->a_cred, ap->a_flags);
+                                ap->a_cred, ap->a_flags, -1);
         hammer_done_transaction(&trans);
 
         return (error);
@@ -2630,7 +2630,8 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
  */
 static int
 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
-                struct vnode *dvp, struct ucred *cred, int flags)
+                struct vnode *dvp, struct ucred *cred,
+                int flags, int isdir)
 {
         struct namecache *ncp;
         hammer_inode_t dip;
@@ -2713,6 +2714,20 @@ retry:
         }
 
         /*
+         * If isdir >= 0 we validate that the entry is or is not a
+         * directory.  If isdir < 0 we don't care.
+         */
+        if (error == 0 && isdir >= 0) {
+                if (isdir &&
+                    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
+                        error = ENOTDIR;
+                } else if (isdir == 0 &&
+                           ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
+                        error = EISDIR;
+                }
+        }
+
+        /*
          * If we are trying to remove a directory the directory must
          * be empty.
          *
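[Editor's note: not part of the commit] The new isdir argument gives hammer_dounlink() a tri-state contract: hammer_vop_nremove() passes 0 (target must not be a directory), hammer_vop_nrmdir() passes 1 (target must be a directory, replacing the ENOTDIR check dropped from the parent-directory path above), and hammer_vop_nrename()/hammer_vop_nwhiteout() pass -1 (don't care). A standalone reduction of that check; check_isdir and obj_is_dir are illustrative stand-ins for the HAMMER_OBJTYPE_DIRECTORY comparison in the diff:

#include <errno.h>

/*
 * Tri-state isdir validation, as added to hammer_dounlink() above.
 * isdir <  0: caller does not care (rename target, whiteout).
 * isdir >  0: target must be a directory (rmdir).
 * isdir == 0: target must not be a directory (unlink).
 */
static int
check_isdir(int isdir, int obj_is_dir)
{
        if (isdir < 0)
                return(0);
        if (isdir && !obj_is_dir)
                return(ENOTDIR);
        if (isdir == 0 && obj_is_dir)
                return(EISDIR);
        return(0);
}

As in the commit, this check only runs after the target inode has been resolved without error (the diff gates it with error == 0 && isdir >= 0), so lookup failures still take precedence over type mismatches.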