sys/kern/vfs_sync.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
  39  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
  40  * $DragonFly: src/sys/kern/vfs_sync.c,v 1.18 2008/05/18 05:54:25 dillon Exp $
  41  */
  42
  43 /*
  44  * External virtual filesystem routines
  45  */
  46 #include "opt_ddb.h"
  47
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/buf.h>
  51 #include <sys/conf.h>
  52 #include <sys/dirent.h>
  53 #include <sys/domain.h>
  54 #include <sys/eventhandler.h>
  55 #include <sys/fcntl.h>
  56 #include <sys/kernel.h>
  57 #include <sys/kthread.h>
  58 #include <sys/malloc.h>
  59 #include <sys/mbuf.h>
  60 #include <sys/mount.h>
  61 #include <sys/proc.h>
  62 #include <sys/namei.h>
  63 #include <sys/reboot.h>
  64 #include <sys/socket.h>
  65 #include <sys/stat.h>
  66 #include <sys/sysctl.h>
  67 #include <sys/syslog.h>
  68 #include <sys/vmmeter.h>
  69 #include <sys/vnode.h>
  70
  71 #include <machine/limits.h>
  72
  73 #include <vm/vm.h>
  74 #include <vm/vm_object.h>
  75 #include <vm/vm_extern.h>
  76 #include <vm/vm_kern.h>
  77 #include <vm/pmap.h>
  78 #include <vm/vm_map.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/vm_pager.h>
  81 #include <vm/vnode_pager.h>
  82
  83 #include <sys/buf2.h>
  84 #include <sys/thread2.h>
  85
  86 /*
  87  * The workitem queue.
  88  */
  89 #define SYNCER_MAXDELAY         32
  90 static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
  91 time_t syncdelay = 30;          /* max time to delay syncing data */
  92 SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW,
  93                 &syncdelay, 0, "VFS data synchronization delay");
  94 time_t filedelay = 30;          /* time to delay syncing files */
  95 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW,
  96                 &filedelay, 0, "File synchronization delay");
  97 time_t dirdelay = 29;           /* time to delay syncing directories */
  98 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW,
  99                 &dirdelay, 0, "Directory synchronization delay");
 100 time_t metadelay = 28;          /* time to delay syncing metadata */
 101 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW,
 102                 &metadelay, 0, "VFS metadata synchronization delay");
 103 static int rushjob;                     /* number of slots to run ASAP */
 104 static int stat_rush_requests;  /* number of times I/O speeded up */
 105 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW,
 106                 &stat_rush_requests, 0, "");
 107
 108 static int syncer_delayno = 0;
 109 static long syncer_mask;
 110 static struct lwkt_token syncer_token;
 111 LIST_HEAD(synclist, vnode);
 112 static struct synclist *syncer_workitem_pending;
 113
 114 /*
 115  * Called from vfsinit()
 116  */
 117 void
 118 vfs_sync_init(void)
 119 {
 120         syncer_workitem_pending = hashinit(syncer_maxdelay, M_DEVBUF,
 121                                             &syncer_mask);
 122         syncer_maxdelay = syncer_mask + 1;
 123         lwkt_token_init(&syncer_token);
 124 }
 125
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, writes to mounted-on
 * block devices are delayed only about half the time that file data
 * is delayed. Similarly, directory updates are more critical, so they
 * are only delayed about a third the time that file data is delayed.
 * There are SYNCER_MAXDELAY queues that are processed round-robin at
 * a rate of one each second (driven off the filesystem syncer process).
 * The syncer_delayno variable indicates the next queue that is to be
 * processed. Items that need to be processed soon are placed in this
 * queue:
 *
 *      syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
 151
 152 /*
 153  * Add an item to the syncer work queue.
 154  *
 155  * MPSAFE
 156  */
 157 void
 158 vn_syncer_add_to_worklist(struct vnode *vp, int delay)
 159 {
 160         lwkt_tokref ilock;
 161         int slot;
 162
 163         lwkt_gettoken(&ilock, &syncer_token);
 164
 165         if (vp->v_flag & VONWORKLST)
 166                 LIST_REMOVE(vp, v_synclist);
 167         if (delay > syncer_maxdelay - 2)
 168                 delay = syncer_maxdelay - 2;
 169         slot = (syncer_delayno + delay) & syncer_mask;
 170
 171         LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 172         vsetflags(vp, VONWORKLST);
 173
 174         lwkt_reltoken(&ilock);
 175 }
 176
 177 struct  thread *updatethread;
 178 static void sched_sync (void);
 179 static struct kproc_desc up_kp = {
 180         "syncer",
 181         sched_sync,
 182         &updatethread
 183 };
 184 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 185
 186 /*
 187  * System filesystem synchronizer daemon.
 188  */
 189 void
 190 sched_sync(void)
 191 {
 192         struct thread *td = curthread;
 193         struct synclist *slp;
 194         struct vnode *vp;
 195         lwkt_tokref ilock;
 196         lwkt_tokref vlock;
 197         long starttime;
 198
 199         EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
 200             SHUTDOWN_PRI_LAST);
 201
 202         for (;;) {
 203                 kproc_suspend_loop();
 204
 205                 starttime = time_second;
 206                 lwkt_gettoken(&ilock, &syncer_token);
 207
 208                 /*
 209                  * Push files whose dirty time has expired.  Be careful
 210                  * of interrupt race on slp queue.
 211                  */
 212                 slp = &syncer_workitem_pending[syncer_delayno];
 213                 syncer_delayno += 1;
 214                 if (syncer_delayno == syncer_maxdelay)
 215                         syncer_delayno = 0;
 216
 217                 while ((vp = LIST_FIRST(slp)) != NULL) {
 218                         if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
 219                                 VOP_FSYNC(vp, MNT_LAZY, 0);
 220                                 vput(vp);
 221                         }
 222
 223                         /*
 224                          * If the vnode is still at the head of the list
 225                          * we were not able to completely flush it.  To
 226                          * give other vnodes a fair shake we move it to
 227                          * a later slot.
 228                          *
 229                          * Note that v_tag VT_VFS vnodes can remain on the
 230                          * worklist with no dirty blocks, but sync_fsync()
 231                          * moves it to a later slot so we will never see it
 232                          * here.
 233                          */
 234                         if (LIST_FIRST(slp) == vp) {
 235                                 lwkt_gettoken(&vlock, &vp->v_token);
 236                                 if (LIST_FIRST(slp) == vp) {
 237                                         if (RB_EMPTY(&vp->v_rbdirty_tree) &&
 238                                             !vn_isdisk(vp, NULL)) {
 239                                                 panic("sched_sync: fsync "
 240                                                       "failed vp %p tag %d",
 241                                                       vp, vp->v_tag);
 242                                         }
 243                                         vn_syncer_add_to_worklist(vp, syncdelay);
 244                                 }
 245                                 lwkt_reltoken(&vlock);
 246                         }
 247                 }
 248                 lwkt_reltoken(&ilock);
 249
 250                 /*
 251                  * Do sync processing for each mount.
 252                  */
 253                 bio_ops_sync(NULL);
 254
 255                 /*
 256                  * The variable rushjob allows the kernel to speed up the
 257                  * processing of the filesystem syncer process. A rushjob
 258                  * value of N tells the filesystem syncer to process the next
 259                  * N seconds worth of work on its queue ASAP. Currently rushjob
 260                  * is used by the soft update code to speed up the filesystem
 261                  * syncer process when the incore state is getting so far
 262                  * ahead of the disk that the kernel memory pool is being
 263                  * threatened with exhaustion.
 264                  */
 265                 if (rushjob > 0) {
 266                         rushjob -= 1;
 267                         continue;
 268                 }
 269                 /*
 270                  * If it has taken us less than a second to process the
 271                  * current work, then wait. Otherwise start right over
 272                  * again. We can still lose time if any single round
 273                  * takes more than two seconds, but it does not really
 274                  * matter as we are just trying to generally pace the
 275                  * filesystem activity.
 276                  */
 277                 if (time_second == starttime)
 278                         tsleep(&lbolt_syncer, 0, "syncer", 0);
 279         }
 280 }
 281
 282 /*
 283  * Request the syncer daemon to speed up its work.
 284  * We never push it to speed up more than half of its
 285  * normal turn time, otherwise it could take over the cpu.
 286  *
 287  * YYY wchan field protected by the BGL.
 288  */
 289 int
 290 speedup_syncer(void)
 291 {
 292         /*
 293          * Don't bother protecting the test.  unsleep_and_wakeup_thread()
 294          * will only do something real if the thread is in the right state.
 295          */
 296         wakeup(&lbolt_syncer);
 297         if (rushjob < syncdelay / 2) {
 298                 rushjob += 1;
 299                 stat_rush_requests += 1;
 300                 return (1);
 301         }
 302         return(0);
 303 }
 304
 305 /*
 306  * Routine to create and manage a filesystem syncer vnode.
 307  */
 308 static int sync_close(struct vop_close_args *);
 309 static int sync_fsync(struct vop_fsync_args *);
 310 static int sync_inactive(struct vop_inactive_args *);
 311 static int sync_reclaim (struct vop_reclaim_args *);
 312 static int sync_print(struct vop_print_args *);
 313
 314 static struct vop_ops sync_vnode_vops = {
 315         .vop_default =  vop_eopnotsupp,
 316         .vop_close =    sync_close,
 317         .vop_fsync =    sync_fsync,
 318         .vop_inactive = sync_inactive,
 319         .vop_reclaim =  sync_reclaim,
 320         .vop_print =    sync_print,
 321 };
 322
 323 static struct vop_ops *sync_vnode_vops_p = &sync_vnode_vops;
 324
 325 VNODEOP_SET(sync_vnode_vops);
 326
 327 /*
 328  * Create a new filesystem syncer vnode for the specified mount point.
 329  * This vnode is placed on the worklist and is responsible for sync'ing
 330  * the filesystem.
 331  *
 332  * NOTE: read-only mounts are also placed on the worklist.  The filesystem
 333  * sync code is also responsible for cleaning up vnodes.
 334  */
 335 int
 336 vfs_allocate_syncvnode(struct mount *mp)
 337 {
 338         struct vnode *vp;
 339         static long start, incr, next;
 340         int error;
 341
 342         /* Allocate a new vnode */
 343         error = getspecialvnode(VT_VFS, mp, &sync_vnode_vops_p, &vp, 0, 0);
 344         if (error) {
 345                 mp->mnt_syncer = NULL;
 346                 return (error);
 347         }
 348         vp->v_type = VNON;
 349         /*
 350          * Place the vnode onto the syncer worklist. We attempt to
 351          * scatter them about on the list so that they will go off
 352          * at evenly distributed times even if all the filesystems
 353          * are mounted at once.
 354          */
 355         next += incr;
 356         if (next == 0 || next > syncer_maxdelay) {
 357                 start /= 2;
 358                 incr /= 2;
 359                 if (start == 0) {
 360                         start = syncer_maxdelay / 2;
 361                         incr = syncer_maxdelay;
 362                 }
 363                 next = start;
 364         }
 365         vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
 366
 367         /*
 368          * The mnt_syncer field inherits the vnode reference, which is
 369          * held until later decomissioning.
 370          */
 371         mp->mnt_syncer = vp;
 372         vx_unlock(vp);
 373         return (0);
 374 }
 375
 376 static int
 377 sync_close(struct vop_close_args *ap)
 378 {
 379         return (0);
 380 }
 381
 382 /*
 383  * Do a lazy sync of the filesystem.
 384  *
 385  * sync_fsync { struct vnode *a_vp, int a_waitfor }
 386  */
 387 static int
 388 sync_fsync(struct vop_fsync_args *ap)
 389 {
 390         struct vnode *syncvp = ap->a_vp;
 391         struct mount *mp = syncvp->v_mount;
 392         int asyncflag;
 393
 394         /*
 395          * We only need to do something if this is a lazy evaluation.
 396          */
 397         if (ap->a_waitfor != MNT_LAZY)
 398                 return (0);
 399
 400         /*
 401          * Move ourselves to the back of the sync list.
 402          */
 403         vn_syncer_add_to_worklist(syncvp, syncdelay);
 404
 405         /*
 406          * Walk the list of vnodes pushing all that are dirty and
 407          * not already on the sync list, and freeing vnodes which have
 408          * no refs and whos VM objects are empty.  vfs_msync() handles
 409          * the VM issues and must be called whether the mount is readonly
 410          * or not.
 411          */
 412         if (vfs_busy(mp, LK_NOWAIT) != 0)
 413                 return (0);
 414         if (mp->mnt_flag & MNT_RDONLY) {
 415                 vfs_msync(mp, MNT_NOWAIT);
 416         } else {
 417                 asyncflag = mp->mnt_flag & MNT_ASYNC;
 418                 mp->mnt_flag &= ~MNT_ASYNC;     /* ZZZ hack */
 419                 vfs_msync(mp, MNT_NOWAIT);
 420                 VFS_SYNC(mp, MNT_LAZY);
 421                 if (asyncflag)
 422                         mp->mnt_flag |= MNT_ASYNC;
 423         }
 424         vfs_unbusy(mp);
 425         return (0);
 426 }
 427
 428 /*
 429  * The syncer vnode is no longer referenced.
 430  *
 431  * sync_inactive { struct vnode *a_vp, struct proc *a_p }
 432  */
 433 static int
 434 sync_inactive(struct vop_inactive_args *ap)
 435 {
 436         vgone_vxlocked(ap->a_vp);
 437         return (0);
 438 }
 439
 440 /*
 441  * The syncer vnode is no longer needed and is being decommissioned.
 442  * This can only occur when the last reference has been released on
 443  * mp->mnt_syncer, so mp->mnt_syncer had better be NULL.
 444  *
 445  * Modifications to the worklist must be protected with a critical
 446  * section.
 447  *
 448  *      sync_reclaim { struct vnode *a_vp }
 449  */
 450 static int
 451 sync_reclaim(struct vop_reclaim_args *ap)
 452 {
 453         struct vnode *vp = ap->a_vp;
 454         lwkt_tokref ilock;
 455
 456         lwkt_gettoken(&ilock, &syncer_token);
 457         KKASSERT(vp->v_mount->mnt_syncer != vp);
 458         if (vp->v_flag & VONWORKLST) {
 459                 LIST_REMOVE(vp, v_synclist);
 460                 vclrflags(vp, VONWORKLST);
 461         }
 462         lwkt_reltoken(&ilock);
 463
 464         return (0);
 465 }
 466
 467 /*
 468  * Print out a syncer vnode.
 469  *
 470  *      sync_print { struct vnode *a_vp }
 471  */
 472 static int
 473 sync_print(struct vop_print_args *ap)
 474 {
 475         struct vnode *vp = ap->a_vp;
 476
 477         kprintf("syncer vnode");
 478         lockmgr_printinfo(&vp->v_lock);
 479         kprintf("\n");
 480         return (0);
 481 }
 482