fs/notify/inode_mark.c

   1 /*
   2  *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
   3  *
   4  *  This program is free software; you can redistribute it and/or modify
   5  *  it under the terms of the GNU General Public License as published by
   6  *  the Free Software Foundation; either version 2, or (at your option)
   7  *  any later version.
   8  *
   9  *  This program is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *  GNU General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU General Public License
  15  *  along with this program; see the file COPYING.  If not, write to
  16  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  17  */
  18
  19 /*
  20  * fsnotify inode mark locking/lifetime/and refcnting
  21  *
  22  * REFCNT:
  23  * The mark->refcnt tells how many "things" in the kernel currently are
  24  * referencing this object.  The object typically will live inside the kernel
  25  * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
  26  * which can find this object holding the appropriate locks, can take a reference
  27  * and the object itself is guaranteed to survive until the reference is dropped.
  28  *
  29  * LOCKING:
  30  * There are 3 spinlocks involved with fsnotify inode marks and they MUST
  31  * be taken in order as follows:
  32  *
  33  * entry->lock
  34  * group->mark_lock
  35  * inode->i_lock
  36  *
  37  * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
  38  * that lock to dereference either of these things (they could be NULL even with
  39  * the lock)
  40  *
  41  * group->mark_lock protects the mark_entries list anchored inside a given group
  42  * and each entry is hooked via the g_list.  It also sorta protects the
  43  * free_g_list, which when used is anchored by a private list on the stack of the
  44  * task which held the group->mark_lock.
  45  *
  46  * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
  47  * given inode and each entry is hooked via the i_list. (and sorta the
  48  * free_i_list)
  49  *
  50  *
  51  * LIFETIME:
  52  * Inode marks survive between when they are added to an inode and when their
  53  * refcnt==0.
  54  *
  55  * The inode mark can be cleared for a number of different reasons including:
  56  * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
  57  * - The inode is being evicted from cache. (fsnotify_inode_delete)
  58  * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
  59  * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
  60  * - The fsnotify_group associated with the mark is going away and all such marks
  61  *   need to be cleaned up. (fsnotify_clear_marks_by_group)
  62  *
  63  * Worst case we are given an inode and need to clean up all the marks on that
  64  * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
  65  * mark on the list we take a reference (so the mark can't disappear under us).
  66  * We remove that mark from the inode's list of marks and we add this mark to a
  67  * private list anchored on the stack using free_i_list;  At this point we no
  68  * longer fear anything finding the mark using the inode's list of marks.
  69  *
  70  * We can safely and locklessly run the private list on the stack of everything
  71  * we just unattached from the original inode.  For each mark on the private list
  72  * we grab the mark->lock and can thus dereference mark->group and mark->inode.  If
  73  * we see the group and inode are not NULL we take those locks.  Now holding all
  74  * 3 locks we can completely remove the mark from other tasks finding it in the
  75  * future.  Remember, 10 things might already be referencing this mark, but they
  76  * better be holding a ref.  We drop our reference we took before we unhooked it
  77  * from the inode.  When the ref hits 0 we can free the mark.
  78  *
  79  * Very similarly for freeing by group, except we use free_g_list.
  80  *
  81  * This has the very interesting property of being able to run concurrently with
  82  * any (or all) other directions.
  83  */
  84
  85 #include <linux/fs.h>
  86 #include <linux/init.h>
  87 #include <linux/kernel.h>
  88 #include <linux/module.h>
  89 #include <linux/mutex.h>
  90 #include <linux/slab.h>
  91 #include <linux/spinlock.h>
  92 #include <linux/writeback.h> /* for inode_lock */
  93
  94 #include <asm/atomic.h>
  95
  96 #include <linux/fsnotify_backend.h>
  97 #include "fsnotify.h"
  98
  99 void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
 100 {
 101         atomic_inc(&entry->refcnt);
 102 }
 103
 104 void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
 105 {
 106         if (atomic_dec_and_test(&entry->refcnt))
 107                 entry->free_mark(entry);
 108 }
 109
 110 /*
 111  * Recalculate the mask of events relevant to a given inode locked.
 112  */
 113 static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
 114 {
 115         struct fsnotify_mark_entry *entry;
 116         struct hlist_node *pos;
 117         __u32 new_mask = 0;
 118
 119         assert_spin_locked(&inode->i_lock);
 120
 121         hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
 122                 new_mask |= entry->mask;
 123         inode->i_fsnotify_mask = new_mask;
 124 }
 125
 126 /*
 127  * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
 128  * any notifier is interested in hearing for this inode.
 129  */
 130 void fsnotify_recalc_inode_mask(struct inode *inode)
 131 {
 132         spin_lock(&inode->i_lock);
 133         fsnotify_recalc_inode_mask_locked(inode);
 134         spin_unlock(&inode->i_lock);
 135
 136         __fsnotify_update_child_dentry_flags(inode);
 137 }
 138
 139 /*
 140  * Any time a mark is getting freed we end up here.
 141  * The caller had better be holding a reference to this mark so we don't actually
 142  * do the final put under the entry->lock
 143  */
 144 void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 145 {
 146         struct fsnotify_group *group;
 147         struct inode *inode;
 148
 149         spin_lock(&entry->lock);
 150
 151         group = entry->group;
 152         inode = entry->inode;
 153
 154         BUG_ON(group && !inode);
 155         BUG_ON(!group && inode);
 156
 157         /* if !group something else already marked this to die */
 158         if (!group) {
 159                 spin_unlock(&entry->lock);
 160                 return;
 161         }
 162
 163         /* 1 from caller and 1 for being on i_list/g_list */
 164         BUG_ON(atomic_read(&entry->refcnt) < 2);
 165
 166         spin_lock(&group->mark_lock);
 167         spin_lock(&inode->i_lock);
 168
 169         hlist_del_init(&entry->i_list);
 170         entry->inode = NULL;
 171
 172         list_del_init(&entry->g_list);
 173         entry->group = NULL;
 174
 175         fsnotify_put_mark(entry); /* for i_list and g_list */
 176
 177         /*
 178          * this mark is now off the inode->i_fsnotify_mark_entries list and we
 179          * hold the inode->i_lock, so this is the perfect time to update the
 180          * inode->i_fsnotify_mask
 181          */
 182         fsnotify_recalc_inode_mask_locked(inode);
 183
 184         spin_unlock(&inode->i_lock);
 185         spin_unlock(&group->mark_lock);
 186         spin_unlock(&entry->lock);
 187
 188         /*
 189          * Some groups like to know that marks are being freed.  This is a
 190          * callback to the group function to let it know that this entry
 191          * is being freed.
 192          */
 193         if (group->ops->freeing_mark)
 194                 group->ops->freeing_mark(entry, group);
 195
 196         /*
 197          * __fsnotify_update_child_dentry_flags(inode);
 198          *
 199          * I really want to call that, but we can't, we have no idea if the inode
 200          * still exists the second we drop the entry->lock.
 201          *
 202          * The next time an event arrive to this inode from one of it's children
 203          * __fsnotify_parent will see that the inode doesn't care about it's
 204          * children and will update all of these flags then.  So really this
 205          * is just a lazy update (and could be a perf win...)
 206          */
 207
 208
 209         iput(inode);
 210
 211         /*
 212          * it's possible that this group tried to destroy itself, but this
 213          * this mark was simultaneously being freed by inode.  If that's the
 214          * case, we finish freeing the group here.
 215          */
 216         if (unlikely(atomic_dec_and_test(&group->num_marks)))
 217                 fsnotify_final_destroy_group(group);
 218 }
 219
 220 /*
 221  * Given a group, destroy all of the marks associated with that group.
 222  */
 223 void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
 224 {
 225         struct fsnotify_mark_entry *lentry, *entry;
 226         LIST_HEAD(free_list);
 227
 228         spin_lock(&group->mark_lock);
 229         list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
 230                 list_add(&entry->free_g_list, &free_list);
 231                 list_del_init(&entry->g_list);
 232                 fsnotify_get_mark(entry);
 233         }
 234         spin_unlock(&group->mark_lock);
 235
 236         list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
 237                 fsnotify_destroy_mark_by_entry(entry);
 238                 fsnotify_put_mark(entry);
 239         }
 240 }
 241
 242 /*
 243  * Given an inode, destroy all of the marks associated with that inode.
 244  */
 245 void fsnotify_clear_marks_by_inode(struct inode *inode)
 246 {
 247         struct fsnotify_mark_entry *entry, *lentry;
 248         struct hlist_node *pos, *n;
 249         LIST_HEAD(free_list);
 250
 251         spin_lock(&inode->i_lock);
 252         hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
 253                 list_add(&entry->free_i_list, &free_list);
 254                 hlist_del_init(&entry->i_list);
 255                 fsnotify_get_mark(entry);
 256         }
 257         spin_unlock(&inode->i_lock);
 258
 259         list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
 260                 fsnotify_destroy_mark_by_entry(entry);
 261                 fsnotify_put_mark(entry);
 262         }
 263 }
 264
 265 /*
 266  * given a group and inode, find the mark associated with that combination.
 267  * if found take a reference to that mark and return it, else return NULL
 268  */
 269 struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
 270                                                      struct inode *inode)
 271 {
 272         struct fsnotify_mark_entry *entry;
 273         struct hlist_node *pos;
 274
 275         assert_spin_locked(&inode->i_lock);
 276
 277         hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
 278                 if (entry->group == group) {
 279                         fsnotify_get_mark(entry);
 280                         return entry;
 281                 }
 282         }
 283         return NULL;
 284 }
 285
 286 /*
 287  * Nothing fancy, just initialize lists and locks and counters.
 288  */
 289 void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
 290                         void (*free_mark)(struct fsnotify_mark_entry *entry))
 291
 292 {
 293         spin_lock_init(&entry->lock);
 294         atomic_set(&entry->refcnt, 1);
 295         INIT_HLIST_NODE(&entry->i_list);
 296         entry->group = NULL;
 297         entry->mask = 0;
 298         entry->inode = NULL;
 299         entry->free_mark = free_mark;
 300 }
 301
 302 /*
 303  * Attach an initialized mark entry to a given group and inode.
 304  * These marks may be used for the fsnotify backend to determine which
 305  * event types should be delivered to which group and for which inodes.
 306  */
 307 int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 308                       struct fsnotify_group *group, struct inode *inode)
 309 {
 310         struct fsnotify_mark_entry *lentry;
 311         int ret = 0;
 312
 313         inode = igrab(inode);
 314         if (unlikely(!inode))
 315                 return -EINVAL;
 316
 317         /*
 318          * LOCKING ORDER!!!!
 319          * entry->lock
 320          * group->mark_lock
 321          * inode->i_lock
 322          */
 323         spin_lock(&entry->lock);
 324         spin_lock(&group->mark_lock);
 325         spin_lock(&inode->i_lock);
 326
 327         lentry = fsnotify_find_mark_entry(group, inode);
 328         if (!lentry) {
 329                 entry->group = group;
 330                 entry->inode = inode;
 331
 332                 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
 333                 list_add(&entry->g_list, &group->mark_entries);
 334
 335                 fsnotify_get_mark(entry); /* for i_list and g_list */
 336
 337                 atomic_inc(&group->num_marks);
 338
 339                 fsnotify_recalc_inode_mask_locked(inode);
 340         }
 341
 342         spin_unlock(&inode->i_lock);
 343         spin_unlock(&group->mark_lock);
 344         spin_unlock(&entry->lock);
 345
 346         if (lentry) {
 347                 ret = -EEXIST;
 348                 iput(inode);
 349                 fsnotify_put_mark(lentry);
 350         } else {
 351                 __fsnotify_update_child_dentry_flags(inode);
 352         }
 353
 354         return ret;
 355 }
 356
 357 /**
 358  * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 359  * @list: list of inodes being unmounted (sb->s_inodes)
 360  *
 361  * Called with inode_lock held, protecting the unmounting super block's list
 362  * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
 363  * We temporarily drop inode_lock, however, and CAN block.
 364  */
 365 void fsnotify_unmount_inodes(struct list_head *list)
 366 {
 367         struct inode *inode, *next_i, *need_iput = NULL;
 368
 369         list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 370                 struct inode *need_iput_tmp;
 371
 372                 /*
 373                  * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
 374                  * I_WILL_FREE, or I_NEW which is fine because by that point
 375                  * the inode cannot have any associated watches.
 376                  */
 377                 if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
 378                         continue;
 379
 380                 /*
 381                  * If i_count is zero, the inode cannot have any watches and
 382                  * doing an __iget/iput with MS_ACTIVE clear would actually
 383                  * evict all inodes with zero i_count from icache which is
 384                  * unnecessarily violent and may in fact be illegal to do.
 385                  */
 386                 if (!atomic_read(&inode->i_count))
 387                         continue;
 388
 389                 need_iput_tmp = need_iput;
 390                 need_iput = NULL;
 391
 392                 /* In case fsnotify_inode_delete() drops a reference. */
 393                 if (inode != need_iput_tmp)
 394                         __iget(inode);
 395                 else
 396                         need_iput_tmp = NULL;
 397
 398                 /* In case the dropping of a reference would nuke next_i. */
 399                 if ((&next_i->i_sb_list != list) &&
 400                     atomic_read(&next_i->i_count) &&
 401                     !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
 402                         __iget(next_i);
 403                         need_iput = next_i;
 404                 }
 405
 406                 /*
 407                  * We can safely drop inode_lock here because we hold
 408                  * references on both inode and next_i.  Also no new inodes
 409                  * will be added since the umount has begun.  Finally,
 410                  * iprune_mutex keeps shrink_icache_memory() away.
 411                  */
 412                 spin_unlock(&inode_lock);
 413
 414                 if (need_iput_tmp)
 415                         iput(need_iput_tmp);
 416
 417                 /* for each watch, send FS_UNMOUNT and then remove it */
 418                 fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 419
 420                 fsnotify_inode_delete(inode);
 421
 422                 iput(inode);
 423
 424                 spin_lock(&inode_lock);
 425         }
 426 }