HAMMER 2.0:01 - MFC HAMMER 2.1:01
authorMatthew Dillon <dillon@dragonflybsd.org>
Wed, 6 Aug 2008 15:41:56 +0000 (6 15:41 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Wed, 6 Aug 2008 15:41:56 +0000 (6 15:41 +0000)
MFC buffer cache leak, invalidation races, b-tree deletion, nlinks,
and rmdir fixes.

sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_cursor.h
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_vnops.c

index 98fe254..1731612 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.71.2.4 2008/08/02 21:24:27 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.71.2.5 2008/08/06 15:41:56 dillon Exp $
  */
 
 /*
@@ -2072,8 +2072,8 @@ hammer_btree_correct_lhb(hammer_cursor_t cursor, hammer_tid_t tid)
  * (cursor->node).  Returns 0 on success, EDEADLK if we could not complete
  * the operation due to a deadlock, or some other error.
  *
- * This routine is always called with an empty, locked leaf but may recurse
- * into want-to-be-empty parents as part of its operation.
+ * This routine is initially called with an empty leaf and may be
+ * recursively called with single-element internal nodes.
  *
  * It should also be noted that when removing empty leaves we must be sure
  * to test and update mirror_tid because another thread may have deadlocked
@@ -2118,6 +2118,13 @@ btree_remove(hammer_cursor_t cursor)
         * Attempt to remove the parent's reference to the child.  If the
         * parent would become empty we have to recurse.  If we fail we 
         * leave the parent pointing to an empty leaf node.
+        *
+        * We have to recurse successfully before we can delete the internal
+        * node as it is illegal to have empty internal nodes.  Even though
+        * the operation may be aborted we must still fixup any unlocked
+        * cursors as if we had deleted the element prior to recursing
+        * (by calling hammer_cursor_deleted_element()) so those cursors
+        * are properly forced up the chain by the recursion.
         */
        if (parent->ondisk->count == 1) {
                /*
@@ -2128,6 +2135,7 @@ btree_remove(hammer_cursor_t cursor)
                 */
                error = hammer_cursor_up_locked(cursor);
                if (error == 0) {
+                       hammer_cursor_deleted_element(cursor->node, 0);
                        error = btree_remove(cursor);
                        if (error == 0) {
                                hammer_modify_node_all(cursor->trans, node);
index 1804607..f035f99 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.41.2.1 2008/08/02 21:24:27 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.41.2.2 2008/08/06 15:41:56 dillon Exp $
  */
 
 /*
@@ -497,7 +497,7 @@ hammer_unlock_cursor(hammer_cursor_t cursor, int also_ip)
        /*
         * Release the cursor's locks and track B-Tree operations on node.
         * While being tracked our cursor can be modified by other threads
-        * and node may be replaced.
+        * and the node may be replaced.
         */
        if (cursor->parent) {
                hammer_unlock(&cursor->parent->lock);
@@ -561,9 +561,15 @@ hammer_lock_cursor(hammer_cursor_t cursor, int also_ip)
        TAILQ_REMOVE(&node->cursor_list, cursor, deadlk_entry);
        cursor->flags &= ~HAMMER_CURSOR_TRACKED;
 
+       /*
+        * If a ripout has occured iterations must re-test the (new)
+        * current element.  Clearing ATEDISK prevents the element from
+        * being skipped and RETEST causes it to be re-tested.
+        */
        if (cursor->flags & HAMMER_CURSOR_TRACKED_RIPOUT) {
                cursor->flags &= ~HAMMER_CURSOR_TRACKED_RIPOUT;
                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
+               cursor->flags |= HAMMER_CURSOR_RETEST;
        }
        error = hammer_load_cursor_parent(cursor, 0);
        return(error);
index 4629401..3001051 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.25.2.1 2008/08/02 21:24:27 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.25.2.2 2008/08/06 15:41:56 dillon Exp $
  */
 
 struct hammer_cmirror;
@@ -129,7 +129,7 @@ typedef struct hammer_cursor *hammer_cursor_t;
 #define HAMMER_CURSOR_ATEMEM           0x0200
 #define HAMMER_CURSOR_DISKEOF          0x0400
 #define HAMMER_CURSOR_MEMEOF           0x0800
-#define HAMMER_CURSOR_DELBTREE         0x1000  /* ip_delete from b-tree */
+#define HAMMER_CURSOR_RETEST           0x1000  /* retest current element */
 #define HAMMER_CURSOR_MIRROR_FILTERED  0x2000  /* mirror_tid filter */
 #define HAMMER_CURSOR_ASOF             0x4000  /* as-of lookup */
 #define HAMMER_CURSOR_CREATE_CHECK     0x8000  /* as-of lookup */
index 69b2efc..fac61e8 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103.2.3 2008/08/02 21:24:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.103.2.4 2008/08/06 15:41:56 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1873,19 +1873,19 @@ hammer_setup_child_callback(hammer_record_t rec, void *data)
 
                /*
                 * If the target IP is already flushing in our group
-                * we are golden, otherwise make sure the target
-                * reflushes.
+                * we could associate the record, but target_ip has
+                * already synced ino_data to sync_ino_data and we
+                * would also have to adjust nlinks.   Plus there are
+                * ordering issues for adds and deletes.
+                *
+                * Reflush downward if this is an ADD, and upward if
+                * this is a DEL.
                 */
                if (target_ip->flush_state == HAMMER_FST_FLUSH) {
-                       if (target_ip->flush_group == flg) {
-                               rec->flush_state = HAMMER_FST_FLUSH;
-                               rec->flush_group = flg;
-                               ++flg->refs;
-                               hammer_ref(&rec->lock);
-                               r = 1;
-                       } else {
+                       if (rec->flush_state == HAMMER_MEM_RECORD_ADD)
+                               ip->flags |= HAMMER_INODE_REFLUSH;
+                       else
                                target_ip->flags |= HAMMER_INODE_REFLUSH;
-                       }
                        break;
                } 
 
index 6ca02b9..20127db 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.49.2.3 2008/08/02 21:24:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.49.2.4 2008/08/06 15:41:56 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -269,10 +269,14 @@ hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
        else
                bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
        if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
+               hammer_ref(&iou->io.lock);
                hammer_io_clear_modify(&iou->io, 1);
                bundirty(bp);
                iou->io.reclaim = 1;
-               hammer_io_deallocate(bp);
+               iou->io.waitdep = 1;
+               KKASSERT(iou->io.lock.refs == 0);
+               hammer_rel_buffer(&iou->buffer, 0);
+               /*hammer_io_deallocate(bp);*/
        } else {
                KKASSERT((bp->b_flags & B_LOCKED) == 0);
                bundirty(bp);
@@ -332,7 +336,9 @@ hammer_io_release(struct hammer_io *io, int flush)
        }
 
        /*
-        * Wait for the IO to complete if asked to.
+        * Wait for the IO to complete if asked to.  This occurs when
+        * the buffer must be disposed of definitively during an umount
+        * or buffer invalidation.
         */
        if (io->waitdep && io->running) {
                hammer_io_wait(io);
@@ -490,7 +496,9 @@ hammer_io_flush(struct hammer_io *io)
         * Do this before potentially blocking so any attempt to modify the
         * ondisk while we are blocked blocks waiting for us.
         */
+       hammer_ref(&io->lock);
        hammer_io_clear_modify(io, 0);
+       hammer_unref(&io->lock);
 
        /*
         * Transfer ownership to the kernel and initiate I/O.
@@ -650,6 +658,9 @@ hammer_modify_buffer_done(hammer_buffer_t buffer)
  * making bulk-modifications to the B-Tree.
  *
  * If inval is non-zero delayed adjustments are ignored.
+ *
+ * This routine may dereference related btree nodes and cause the
+ * buffer to be dereferenced.  The caller must own a reference on io.
  */
 void
 hammer_io_clear_modify(struct hammer_io *io, int inval)
@@ -698,7 +709,8 @@ restart:
                        goto restart;
                }
        }
-
+       /* caller must still have ref on io */
+       KKASSERT(io->lock.refs > 0);
 }
 
 /*
@@ -951,9 +963,18 @@ hammer_io_checkwrite(struct buf *bp)
        /*
         * We can only clear the modified bit if the IO is not currently
         * undergoing modification.  Otherwise we may miss changes.
+        *
+        * Only data and undo buffers can reach here.  These buffers do
+        * not have terminal crc functions but we temporarily reference
+        * the IO anyway, just in case.
         */
-       if (io->modify_refs == 0 && io->modified)
+       if (io->modify_refs == 0 && io->modified) {
+               hammer_ref(&io->lock);
                hammer_io_clear_modify(io, 0);
+               hammer_unref(&io->lock);
+       } else if (io->modified) {
+               KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
+       }
 
        /*
         * The kernel is going to start the IO, set io->running.
index 144bb29..e4d306c 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.90.2.3 2008/08/02 21:24:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.90.2.4 2008/08/06 15:41:56 dillon Exp $
  */
 
 #include "hammer.h"
@@ -1084,7 +1084,7 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
        if (record->type == HAMMER_MEM_RECORD_DEL) {
                error = hammer_btree_lookup(cursor);
                if (error == 0) {
-                       /* XXX iprec? */
+                       KKASSERT(cursor->iprec == NULL);
                        error = hammer_ip_delete_record(cursor, record->ip,
                                                        trans->tid);
                        if (error == 0) {
@@ -1312,7 +1312,7 @@ hammer_ip_first(hammer_cursor_t cursor)
        /*
         * Clean up fields and setup for merged scan
         */
-       cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
+       cursor->flags &= ~HAMMER_CURSOR_RETEST;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
@@ -1393,17 +1393,17 @@ next_btree:
         * records we have to get the next one. 
         *
         * If we deleted the last on-disk record we had scanned ATEDISK will
-        * be clear and DELBTREE will be set, forcing a call to iterate. The
+        * be clear and RETEST will be set, forcing a call to iterate.  The
         * fact that ATEDISK is clear causes iterate to re-test the 'current'
         * element.  If ATEDISK is set, iterate will skip the 'current'
         * element.
         *
         * Get the next on-disk record
         */
-       if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
+       if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_RETEST)) {
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        error = hammer_btree_iterate(cursor);
-                       cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
+                       cursor->flags &= ~HAMMER_CURSOR_RETEST;
                        if (error == 0) {
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                                hammer_cache_node(&cursor->ip->cache[1],
@@ -1750,8 +1750,8 @@ retry:
                 *
                 * This will also physically destroy the B-Tree entry and
                 * data if the retention policy dictates.  The function
-                * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
-                * uses to perform a fixup.
+                * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
+                * to retest the new 'current' element.
                 */
                if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
@@ -1877,8 +1877,8 @@ retry:
                 * Mark the record and B-Tree entry as deleted.  This will
                 * also physically delete the B-Tree entry, record, and
                 * data if the retention policy dictates.  The function
-                * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
-                * uses to perform a fixup.
+                * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
+                * to retest the new 'current' element.
                 *
                 * Directory entries (and delete-on-disk directory entries)
                 * must be synced and cannot be deleted.
@@ -2038,10 +2038,11 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
                 * Adjust for the iteration.  We have deleted the current
                 * element and want to clear ATEDISK so the iteration does
                 * not skip the element after, which now becomes the current
-                * element.
+                * element.  This element must be re-tested if doing an
+                * iteration, which is handled by the RETEST flag.
                 */
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
-                       cursor->flags |= HAMMER_CURSOR_DELBTREE;
+                       cursor->flags |= HAMMER_CURSOR_RETEST;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                }
 
@@ -2076,12 +2077,14 @@ hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
                error = hammer_btree_delete(cursor);
                if (error == 0) {
                        /*
-                        * This forces a fixup for the iteration because
-                        * the cursor is now either sitting at the 'next'
-                        * element or sitting at the end of a leaf.
+                        * The deletion moves the next element (if any) to
+                        * the current element position.  We must clear
+                        * ATEDISK so this element is not skipped and we
+                        * must set RETEST to force any iteration to re-test
+                        * the element.
                         */
                        if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
-                               cursor->flags |= HAMMER_CURSOR_DELBTREE;
+                               cursor->flags |= HAMMER_CURSOR_RETEST;
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        }
                }
index de20a32..021cddf 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.69.2.4 2008/08/02 21:24:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.69.2.5 2008/08/06 15:41:56 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -653,8 +653,9 @@ found:
 /*
  * This is used by the direct-read code to deal with large-data buffers
  * created by the reblocker and mirror-write code.  The direct-read code
- * bypasses the HAMMER buffer subsystem and so any aliased dirty hammer
- * buffers must be fully synced to disk before we can issue the direct-read.
+ * bypasses the HAMMER buffer subsystem and so any aliased dirty or write-
+ * running hammer buffers must be fully synced to disk before we can issue
+ * the direct-read.
  *
  * This code path is not considered critical as only the rebocker and
  * mirror-write code will create large-data buffers via the HAMMER buffer
@@ -673,13 +674,16 @@ hammer_sync_buffers(hammer_mount_t hmp, hammer_off_t base_offset, int bytes)
        while (bytes > 0) {
                buffer = RB_LOOKUP(hammer_buf_rb_tree, &hmp->rb_bufs_root,
                                   base_offset);
-               if (buffer && buffer->io.modified) {
+               if (buffer && (buffer->io.modified || buffer->io.running)) {
                        error = hammer_ref_buffer(buffer);
-                       if (error == 0 && buffer->io.modified) {
-                               hammer_io_write_interlock(&buffer->io);
-                               hammer_io_flush(&buffer->io);
-                               hammer_io_done_interlock(&buffer->io);
+                       if (error == 0) {
                                hammer_io_wait(&buffer->io);
+                               if (buffer->io.modified) {
+                                       hammer_io_write_interlock(&buffer->io);
+                                       hammer_io_flush(&buffer->io);
+                                       hammer_io_done_interlock(&buffer->io);
+                                       hammer_io_wait(&buffer->io);
+                               }
                                hammer_rel_buffer(buffer, 0);
                        }
                }
@@ -718,6 +722,7 @@ hammer_del_buffers(hammer_mount_t hmp, hammer_off_t base_offset,
                                KKASSERT(buffer->zone2_offset == zone2_offset);
                                hammer_io_clear_modify(&buffer->io, 1);
                                buffer->io.reclaim = 1;
+                               buffer->io.waitdep = 1;
                                KKASSERT(buffer->volume == volume);
                                hammer_rel_buffer(buffer, 0);
                        }
index 9c2dfda..e98d5bb 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.91.2.3 2008/08/02 21:24:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.91.2.4 2008/08/06 15:41:56 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -1652,6 +1652,8 @@ hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
        int error;
 
        dip = VTOI(ap->a_dvp);
+       if (dip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY)
+               return(ENOTDIR);
 
        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {