From e98f1b96f17c57d1d59a4bba1d6a26b281767c07 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 19 Feb 2010 10:41:22 -0800 Subject: [PATCH] HAMMER VFS - Reduce stalls during bulk file operations * Track modifying inode operations on a per-PID basis (loosely) and call hammer_inode_waitreclaims() earlier for those pids. The algorithm selects a wait point based on the process's perceived contribution to the inode load. The greater the contribution, the more readily we stall the process in order to wait for related reclaims to process. Processes with lower loads have higher reclaim points and do not stall as readily as they did before. * Remove waitreclaims calls based on B-Tree scans. I'm not sure why I had this in there but it was creating an excessive number of unnecessary stalls, so if any problems crop up I'll have to find another way to deal with them. * These changes (particularly the first) should reduce unnecessary stalls for the programs not doing heavy inode operations. Hopefully that means rm -rf and tar extractions will not have quite as detrimental an effect on other processes as they did before. --- sys/vfs/hammer/hammer.h | 20 +++++++-- sys/vfs/hammer/hammer_inode.c | 86 ++++++++++++++++++++++++++++++++++--- sys/vfs/hammer/hammer_object.c | 10 ++--- sys/vfs/hammer/hammer_prune.c | 2 +- sys/vfs/hammer/hammer_transaction.c | 7 +-- sys/vfs/hammer/hammer_vnops.c | 12 +++--- 6 files changed, 113 insertions(+), 24 deletions(-) diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index b41a16c02e..44afd336fd 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -427,6 +427,19 @@ struct hammer_reclaim { #define HAMMER_RECLAIM_WAIT 4000 /* default vfs.hammer.limit_reclaim */ /* + * Track who is creating the greatest burden on the + * inode cache. 
+ */ +struct hammer_inostats { + pid_t pid; /* track user process */ + int ltick; /* last tick */ + int count; /* count (degenerates) */ +}; + +#define HAMMER_INOSTATS_HSIZE 32 +#define HAMMER_INOSTATS_HMASK (HAMMER_INOSTATS_HSIZE - 1) + +/* * Structure used to represent an unsynchronized record in-memory. These * records typically represent directory entries. Only non-historical * records are kept in-memory. @@ -844,6 +857,8 @@ struct hammer_mount { TAILQ_HEAD(, hammer_objid_cache) objid_cache_list; TAILQ_HEAD(, hammer_reclaim) reclaim_list; TAILQ_HEAD(, hammer_io) iorun_list; + + struct hammer_inostats inostats[HAMMER_INOSTATS_HSIZE]; }; typedef struct hammer_mount *hammer_mount_t; @@ -968,8 +983,7 @@ void hammer_scan_inode_snapshots(hammer_mount_t hmp, void *data); void hammer_put_inode(struct hammer_inode *ip); void hammer_put_inode_ref(struct hammer_inode *ip); -void hammer_inode_waitreclaims(hammer_mount_t hmp); -void hammer_inode_waithard(hammer_mount_t hmp); +void hammer_inode_waitreclaims(hammer_transaction_t trans); int hammer_unload_volume(hammer_volume_t volume, void *data __unused); int hammer_adjust_volume_mode(hammer_volume_t volume, void *data __unused); @@ -1211,7 +1225,7 @@ void hammer_start_transaction_fls(struct hammer_transaction *trans, void hammer_done_transaction(struct hammer_transaction *trans); hammer_tid_t hammer_alloc_tid(hammer_mount_t hmp, int count); -void hammer_modify_inode(hammer_inode_t ip, int flags); +void hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags); void hammer_flush_inode(hammer_inode_t ip, int flags); void hammer_flush_inode_done(hammer_inode_t ip, int error); void hammer_wait_inode(hammer_inode_t ip); diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 5fd1a34ac0..0537d11d17 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -50,6 +50,8 @@ static int hammer_setup_parent_inodes(hammer_inode_t ip, int depth, static int 
hammer_setup_parent_inodes_helper(hammer_record_t record, int depth, hammer_flush_group_t flg); static void hammer_inode_wakereclaims(hammer_inode_t ip); +static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp, + pid_t pid); #ifdef DEBUG_TRUNCATE extern struct hammer_inode *HammerTruncIp; @@ -563,7 +565,13 @@ retry: ip = NULL; } hammer_done_cursor(&cursor); - trans->flags |= HAMMER_TRANSF_NEWINODE; + + /* + * NEWINODE is only set if the inode becomes dirty later, + * setting it here just leads to unnecessary stalls. + * + * trans->flags |= HAMMER_TRANSF_NEWINODE; + */ return (ip); } @@ -1091,7 +1099,7 @@ hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred, pfsm, &ip); if (error == 0) { ++ip->ino_data.nlinks; - hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); } } if (ip) @@ -1586,7 +1594,7 @@ hammer_reload_inode(hammer_inode_t ip, void *arg __unused) * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated */ void -hammer_modify_inode(hammer_inode_t ip, int flags) +hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags) { /* * ronly of 0 or 2 does not trigger assertion. @@ -1602,6 +1610,17 @@ hammer_modify_inode(hammer_inode_t ip, int flags) ++ip->hmp->rsv_inodes; } + /* + * Set the NEWINODE flag in the transaction if the inode + * transitions to a dirty state. This is used to track + * the load on the inode cache. + */ + if (trans && + (ip->flags & HAMMER_INODE_MODMASK) == 0 && + (flags & HAMMER_INODE_MODMASK)) { + trans->flags |= HAMMER_TRANSF_NEWINODE; + } + ip->flags |= flags; } @@ -3120,12 +3139,36 @@ hammer_inode_wakereclaims(hammer_inode_t ip) * as lone as one does. 
*/ void -hammer_inode_waitreclaims(hammer_mount_t hmp) +hammer_inode_waitreclaims(hammer_transaction_t trans) { + hammer_mount_t hmp = trans->hmp; struct hammer_reclaim reclaim; - if (hmp->inode_reclaims < hammer_limit_reclaim) - return; + /* + * Track inode load + */ + if (curthread->td_proc) { + struct hammer_inostats *stats; + int lower_limit; + + stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid); + ++stats->count; + + if (stats->count > hammer_limit_reclaim / 2) + stats->count = hammer_limit_reclaim / 2; + lower_limit = hammer_limit_reclaim - stats->count; + if (hammer_debug_general & 0x10000) + kprintf("pid %5d limit %d\n", (int)curthread->td_proc->p_pid, lower_limit); + + if (hmp->inode_reclaims < lower_limit) + return; + } else { + /* + * Default mode + */ + if (hmp->inode_reclaims < hammer_limit_reclaim) + return; + } reclaim.count = 1; TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry); tsleep(&reclaim, 0, "hmrrcm", hz); @@ -3133,6 +3176,37 @@ hammer_inode_waitreclaims(hammer_mount_t hmp) TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry); } +static +struct hammer_inostats * +hammer_inode_inostats(hammer_mount_t hmp, pid_t pid) +{ + struct hammer_inostats *stats; + int delta; + int chain; + + for (chain = 0; chain < 4; ++chain) { + stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK]; + if (stats->pid == pid) + break; + } + if (chain == 4) { + stats = &hmp->inostats[(pid + ticks) & HAMMER_INOSTATS_HMASK]; + stats->pid = pid; + } + + if (stats->count && stats->ltick != ticks) { + delta = ticks - stats->ltick; + stats->ltick = ticks; + if (delta <= 0 || delta > hz * 60) + stats->count = 0; + else + stats->count = stats->count * hz / (hz + delta); + } + if (hammer_debug_general & 0x10000) + kprintf("pid %5d stats %d\n", (int)pid, stats->count); + return (stats); +} + #if 0 /* diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 6f578db67a..ac4d8e69d2 100644 --- a/sys/vfs/hammer/hammer_object.c +++ 
b/sys/vfs/hammer/hammer_object.c @@ -679,7 +679,7 @@ hammer_ip_add_directory(struct hammer_transaction *trans, ++ip->ino_data.nlinks; ip->ino_data.ctime = trans->time; - hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); /* * Find an unused namekey. Both the in-memory record tree and @@ -735,7 +735,7 @@ hammer_ip_add_directory(struct hammer_transaction *trans, error = hammer_mem_add(record); if (error == 0) { dip->ino_data.mtime = trans->time; - hammer_modify_inode(dip, HAMMER_INODE_MTIME); + hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); } failed: hammer_done_cursor(&cursor); @@ -850,9 +850,9 @@ hammer_ip_del_directory(struct hammer_transaction *trans, ip->ino_data.ctime = trans->time; } dip->ino_data.mtime = trans->time; - hammer_modify_inode(dip, HAMMER_INODE_MTIME); + hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); if (ip) { - hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); if (ip->ino_data.nlinks == 0 && (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { hammer_done_cursor(cursor); @@ -1381,7 +1381,7 @@ hammer_mem_add(hammer_record_t record) ++record->ip->rsv_recs; record->ip->hmp->rsv_databytes += record->leaf.data_len; record->flags |= HAMMER_RECF_ONRBTREE; - hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY); + hammer_modify_inode(NULL, record->ip, HAMMER_INODE_XDIRTY); hammer_rel_mem_record(record); return(0); } diff --git a/sys/vfs/hammer/hammer_prune.c b/sys/vfs/hammer/hammer_prune.c index c0dca4090c..aed0de9db3 100644 --- a/sys/vfs/hammer/hammer_prune.c +++ b/sys/vfs/hammer/hammer_prune.c @@ -324,7 +324,7 @@ prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) (long long)elm->base.obj_id); } hammer_rel_inode(ip, 0); - hammer_inode_waitreclaims(cursor->trans->hmp); + hammer_inode_waitreclaims(cursor->trans); } else { kprintf("unable to prune disconnected inode %016llx\n", (long long)elm->base.obj_id); diff --git 
a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c index a3fb181293..5d668be0ee 100644 --- a/sys/vfs/hammer/hammer_transaction.c +++ b/sys/vfs/hammer/hammer_transaction.c @@ -118,7 +118,6 @@ hammer_start_transaction_fls(struct hammer_transaction *trans, void hammer_done_transaction(struct hammer_transaction *trans) { - hammer_mount_t hmp = trans->hmp; int expected_lock_refs; hammer_rel_volume(trans->rootvol, 0); @@ -128,9 +127,11 @@ hammer_done_transaction(struct hammer_transaction *trans) trans->sync_lock_refs = 0; if (trans->type != HAMMER_TRANS_FLS) { if (trans->flags & HAMMER_TRANSF_NEWINODE) - hammer_inode_waitreclaims(hmp); + hammer_inode_waitreclaims(trans); + /* else if (trans->flags & HAMMER_TRANSF_DIDIO) - hammer_inode_waitreclaims(hmp); + hammer_inode_waitreclaims(trans); + */ } } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index f43c003878..460a347adc 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -441,7 +441,7 @@ skip: if ((ip->flags & HAMMER_INODE_RO) == 0 && (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) { ip->ino_data.atime = trans.time; - hammer_modify_inode(ip, HAMMER_INODE_ATIME); + hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); } hammer_done_transaction(&trans); if (got_mplock > 0) @@ -742,7 +742,7 @@ hammer_vop_write(struct vop_write_args *ap) } ip->ino_data.mtime = trans.time; flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS; - hammer_modify_inode(ip, flags); + hammer_modify_inode(&trans, ip, flags); /* * Once we dirty the buffer any cached zone-X offset @@ -1893,7 +1893,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap) if (error == 0) { ip->ino_data.parent_obj_id = tdip->obj_id; ip->ino_data.ctime = trans.time; - hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY); } } if (error) @@ -2030,7 +2030,7 @@ hammer_vop_markatime(struct vop_markatime_args *ap) ++hammer_stats_file_iopsw; ip->ino_data.atime = 
trans.time; - hammer_modify_inode(ip, HAMMER_INODE_ATIME); + hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, NOTE_ATTRIB); return (0); @@ -2265,7 +2265,7 @@ hammer_vop_setattr(struct vop_setattr_args *ap) } done: if (error == 0) - hammer_modify_inode(ip, modflags); + hammer_modify_inode(&trans, ip, modflags); hammer_done_transaction(&trans); hammer_knote(ap->a_vp, kflags); return (error); @@ -2344,7 +2344,7 @@ hammer_vop_nsymlink(struct vop_nsymlink_args *ap) */ if (error == 0) { nip->ino_data.size = bytes; - hammer_modify_inode(nip, HAMMER_INODE_DDIRTY); + hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY); } } if (error == 0) -- 2.11.4.GIT