release/src-rt-6.x.4708/linux/linux-2.6.36/fs/jbd/commit.c

   1 /*
   2  * linux/fs/jbd/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd.h>
  19 #include <linux/errno.h>
  20 #include <linux/mm.h>
  21 #include <linux/pagemap.h>
  22 #include <linux/bio.h>
  23
  24 /*
  25  * IO completion for temporary BJ_IO buffer_heads: latch result, then unlock.
  26  */
  27 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  28 {
  29         BUFFER_TRACE(bh, "");
  30         if (!uptodate)
  31                 clear_buffer_uptodate(bh);
  32         else
  33                 set_buffer_uptodate(bh);
  34         unlock_buffer(bh);
  35 }
  36
  37 /*
  38  * Truncating an ext3-ordered file can leave many pages unfreed because their
  39  * buffers still belong to a committing transaction.  Once that transaction
  40  * has committed, such pages linger on the LRU with buffers attached but no
  41  * ->mapping.  The VM can reclaim them trivially, yet their apparent absence
  42  * skews the VM accounting and makes the /proc/meminfo numbers look odd.
  43  *
  44  * Given a buffer that has just come off the forget list, try to strip every
  45  * buffer from its backing page.  Nothing is stripped if the buffer is dirty
  46  * or referenced by anyone else, has no page, or if that page still has a
  47  * ->mapping or cannot be locked without blocking.
  48  *
  49  * Called under journal->j_list_lock.  Consumes the caller's buffer reference.
  50  */
  51 static void release_buffer_page(struct buffer_head *bh)
  52 {
  53         struct page *pg;
  54
  55         /* Only a clean buffer that nobody else references qualifies. */
  56         if (buffer_dirty(bh) || atomic_read(&bh->b_count) != 1)
  57                 goto drop_ref;
  58
  59         /* It must sit on a page that no longer has a ->mapping. */
  60         pg = bh->b_page;
  61         if (!pg || pg->mapping)
  62                 goto drop_ref;
  63
  64         /* OK, a truncated page: only ever trylock it. */
  65         if (!trylock_page(pg))
  66                 goto drop_ref;
  67
  68         /* Hold our own page reference while the buffers are stripped. */
  69         page_cache_get(pg);
  70         __brelse(bh);
  71         try_to_free_buffers(pg);
  72         unlock_page(pg);
  73         page_cache_release(pg);
  74         return;
  75
  76 drop_ref:
  77         __brelse(bh);
  78 }
  79
  80 /*
  81  * Drop a data buffer ref; a 'BH_Freed' one also releases its page if possible.
  82  */
  83 static void release_data_buffer(struct buffer_head *bh)
  84 {
  85         if (!buffer_freed(bh)) {
  86                 put_bh(bh);
  87                 return;
  88         }
  89         clear_buffer_freed(bh);
  90         release_buffer_page(bh);
  91 }
  92
  93 /*
  94  * With j_list_lock already held, attempt to take the buffer's bh_state lock.
  95  * Lock ranking only permits a trylock here.  Returns 1 with both locks held;
  96  * on failure drops j_list_lock, schedules away and returns 0.
  97  */
  98 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  99 {
 100         if (jbd_trylock_bh_state(bh))
 101                 return 1;
 102         /* Lost the trylock: give up j_list_lock and yield the CPU. */
 103         spin_unlock(&journal->j_list_lock);
 104         schedule();
 105         return 0;
 106 }
 107
 108 /*
 109  * Final step of a commit: write the commit record.  Earlier buffers should
 110  * have been cleaned up by now, so when the journal is in abort mode the
 111  * rest of the journal write is simply skipped.
 112  *
 113  * Returns 1 if the journal needs to be aborted, 0 otherwise.
 114  */
 115 static int journal_write_commit_record(journal_t *journal,
 116                                         transaction_t *commit_transaction)
 117 {
 118         struct journal_head *commit_jh;
 119         struct buffer_head *commit_bh;
 120         journal_header_t *hdr;
 121         int ret;
 122
 123         if (is_journal_aborted(journal))
 124                 return 0;
 125
 126         commit_jh = journal_get_descriptor_buffer(journal);
 127         if (!commit_jh)
 128                 return 1;
 129         commit_bh = jh2bh(commit_jh);
 130
 131         /* Fill in the journal header that marks this as a commit block. */
 132         hdr = (journal_header_t *)(commit_bh->b_data);
 133         hdr->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
 134         hdr->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
 135         hdr->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 136
 137         JBUFFER_TRACE(commit_jh, "write commit block");
 138         set_buffer_dirty(commit_bh);
 139
 140         if (!(journal->j_flags & JFS_BARRIER)) {
 141                 ret = sync_dirty_buffer(commit_bh);
 142                 goto written;
 143         }
 144
 145         ret = __sync_dirty_buffer(commit_bh, WRITE_SYNC | WRITE_BARRIER);
 146         /*
 147          * If another commit can fail at about the same time, the barrier
 148          * flag in the super cannot be trusted; what counts is whether
 149          * this particular write went out as a barrier request.
 150          */
 151         if (ret == -EOPNOTSUPP) {
 152                 char b[BDEVNAME_SIZE];
 153
 154                 printk(KERN_WARNING
 155                         "JBD: barrier-based sync failed on %s - "
 156                         "disabling barriers\n",
 157                         bdevname(journal->j_dev, b));
 158                 spin_lock(&journal->j_state_lock);
 159                 journal->j_flags &= ~JFS_BARRIER;
 160                 spin_unlock(&journal->j_state_lock);
 161
 162                 /* Re-mark the buffer and write it again, barrier-free. */
 163                 set_buffer_uptodate(commit_bh);
 164                 set_buffer_dirty(commit_bh);
 165                 ret = sync_dirty_buffer(commit_bh);
 166         }
 167
 168 written:
 169         put_bh(commit_bh);      /* One for getblk() */
 170         journal_put_journal_head(commit_jh);
 171
 172         return ret == -EIO;
 173 }
 174
 175 /* Submit the first @bufs buffers in @wbuf for write-out with @write_op,
 176  * installing end_buffer_write_sync as their completion handler. */
 177 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
 178                                    int write_op)
 179 {
 180         for (; bufs > 0; bufs--, wbuf++) {
 181                 (*wbuf)->b_end_io = end_buffer_write_sync;
 182                 /* submit_bh() uses up the safety reference we hold */
 183                 submit_bh(write_op, *wbuf);
 184         }
 185 }
 186
 187 /*
 188  *  Submit all the data buffers to disk
      *
      *  Walks commit_transaction->t_sync_datalist until it is empty.  Dirty
      *  buffers are locked, refiled as BJ_Locked and batched in journal->j_wbuf
      *  for journal_do_submit_data(), which submits them with write_op.
      *  Buffers found locked without having been dirty are only refiled as
      *  BJ_Locked; all remaining buffers are unfiled from the transaction.
      *
      *  Takes and releases journal->j_list_lock itself, possibly many times.
      *
      *  Returns 0, or -EIO if a buffer being unfiled was not uptodate.
 189  */
 190 static int journal_submit_data_buffers(journal_t *journal,
 191                                        transaction_t *commit_transaction,
 192                                        int write_op)
 193 {
 194         struct journal_head *jh;
 195         struct buffer_head *bh;
 196         int locked;
 197         int bufs = 0;
 198         struct buffer_head **wbuf = journal->j_wbuf;
 199         int err = 0;
 200
 201         /*
 202          * Whenever we unlock the journal and sleep, things can get added
 203          * onto ->t_sync_datalist, so we have to keep looping back to
 204          * write_out_data until we *know* that the list is empty.
 205          *
 206          * Clean up any flushed data buffers from the data list.  Even in
 207          * abort mode, we want to flush this out as soon as possible.
 208          */
 209 write_out_data:
 210         cond_resched();
 211         spin_lock(&journal->j_list_lock);
 212
 213         while (commit_transaction->t_sync_datalist) {
 214                 jh = commit_transaction->t_sync_datalist;
 215                 bh = jh2bh(jh);
 216                 locked = 0;
 217
 218                 /* Get reference just to make sure buffer does not disappear
 219                  * when we are forced to drop various locks */
 220                 get_bh(bh);
 221                 /* If the buffer is dirty, we need to submit IO and hence
 222                  * we need the buffer lock. We try to lock the buffer without
 223                  * blocking. If we fail, we need to drop j_list_lock and do
 224                  * blocking lock_buffer().
 225                  */
 226                 if (buffer_dirty(bh)) {
 227                         if (!trylock_buffer(bh)) {
 228                                 BUFFER_TRACE(bh, "needs blocking lock");
 229                                 spin_unlock(&journal->j_list_lock);
 230                                 /* Write out all data to prevent deadlocks */
 231                                 journal_do_submit_data(wbuf, bufs, write_op);
 232                                 bufs = 0;
 233                                 lock_buffer(bh);
 234                                 spin_lock(&journal->j_list_lock);
 235                         }
 236                         locked = 1;
 237                 }
 238                 /* We have to get bh_state lock. Again out of order, sigh. */
 239                 if (!inverted_lock(journal, bh)) {
                             /* inverted_lock() dropped j_list_lock on failure:
                              * take bh_state first, then j_list_lock again */
 240                         jbd_lock_bh_state(bh);
 241                         spin_lock(&journal->j_list_lock);
 242                 }
 243                 /* Someone already cleaned up the buffer? */
 244                 if (!buffer_jbd(bh) || bh2jh(bh) != jh
 245                         || jh->b_transaction != commit_transaction
 246                         || jh->b_jlist != BJ_SyncData) {
 247                         jbd_unlock_bh_state(bh);
 248                         if (locked)
 249                                 unlock_buffer(bh);
 250                         BUFFER_TRACE(bh, "already cleaned up");
 251                         release_data_buffer(bh);
 252                         continue;
 253                 }
 254                 if (locked && test_clear_buffer_dirty(bh)) {
 255                         BUFFER_TRACE(bh, "needs writeout, adding to array");
                             /* The reference from get_bh() above stays with
                              * the batched buffer; it is not dropped here */
 256                         wbuf[bufs++] = bh;
 257                         __journal_file_buffer(jh, commit_transaction,
 258                                                 BJ_Locked);
 259                         jbd_unlock_bh_state(bh);
                             /* Batch array full: submit it, then rescan */
 260                         if (bufs == journal->j_wbufsize) {
 261                                 spin_unlock(&journal->j_list_lock);
 262                                 journal_do_submit_data(wbuf, bufs, write_op);
 263                                 bufs = 0;
 264                                 goto write_out_data;
 265                         }
 266                 } else if (!locked && buffer_locked(bh)) {
                             /* Was not dirty when checked, yet is locked:
                              * nothing to submit from here, only refile it
                              * and drop our safety reference */
 267                         __journal_file_buffer(jh, commit_transaction,
 268                                                 BJ_Locked);
 269                         jbd_unlock_bh_state(bh);
 270                         put_bh(bh);
 271                 } else {
 272                         BUFFER_TRACE(bh, "writeout complete: unfile");
 273                         if (unlikely(!buffer_uptodate(bh)))
 274                                 err = -EIO;
 275                         __journal_unfile_buffer(jh);
 276                         jbd_unlock_bh_state(bh);
 277                         if (locked)
 278                                 unlock_buffer(bh);
 279                         journal_remove_journal_head(bh);
 280                         /* One for our safety reference, other for
 281                          * journal_remove_journal_head() */
 282                         put_bh(bh);
 283                         release_data_buffer(bh);
 284                 }
 285
                     /* Give way if a reschedule is due or the lock is contended;
                      * the scan restarts from the list head afterwards */
 286                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 287                         spin_unlock(&journal->j_list_lock);
 288                         goto write_out_data;
 289                 }
 290         }
 291         spin_unlock(&journal->j_list_lock);
             /* Submit whatever is still batched in wbuf */
 292         journal_do_submit_data(wbuf, bufs, write_op);
 293
 294         return err;
 295 }
 296
 297 /*
 298  * journal_commit_transaction
 299  *
 300  * The primary function for committing a transaction to the log.  This
 301  * function is called by the journal thread to begin a complete commit.
 302  */
 303 void journal_commit_transaction(journal_t *journal)
 304 {
 305         transaction_t *commit_transaction;
 306         struct journal_head *jh, *new_jh, *descriptor;
 307         struct buffer_head **wbuf = journal->j_wbuf;
 308         int bufs;
 309         int flags;
 310         int err;
 311         unsigned int blocknr;
 312         ktime_t start_time;
 313         u64 commit_time;
 314         char *tagp = NULL;
 315         journal_header_t *header;
 316         journal_block_tag_t *tag = NULL;
 317         int space_left = 0;
 318         int first_tag = 0;
 319         int tag_flag;
 320         int i;
 321         int write_op = WRITE;
 322
 323         /*
 324          * First job: lock down the current transaction and wait for
 325          * all outstanding updates to complete.
 326          */
 327
 328 #ifdef COMMIT_STATS
 329         spin_lock(&journal->j_list_lock);
 330         summarise_journal_usage(journal);
 331         spin_unlock(&journal->j_list_lock);
 332 #endif
 333
 334         /* Do we need to erase the effects of a prior journal_flush? */
 335         if (journal->j_flags & JFS_FLUSHED) {
 336                 jbd_debug(3, "super block updated\n");
 337                 journal_update_superblock(journal, 1);
 338         } else {
 339                 jbd_debug(3, "superblock not updated\n");
 340         }
 341
 342         J_ASSERT(journal->j_running_transaction != NULL);
 343         J_ASSERT(journal->j_committing_transaction == NULL);
 344
 345         commit_transaction = journal->j_running_transaction;
 346         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 347
 348         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 349                         commit_transaction->t_tid);
 350
 351         spin_lock(&journal->j_state_lock);
 352         commit_transaction->t_state = T_LOCKED;
 353
 354         /*
 355          * Use plugged writes here, since we want to submit several before
 356          * we unplug the device. We don't do explicit unplugging in here,
 357          * instead we rely on sync_buffer() doing the unplug for us.
 358          */
 359         if (commit_transaction->t_synchronous_commit)
 360                 write_op = WRITE_SYNC_PLUG;
 361         spin_lock(&commit_transaction->t_handle_lock);
 362         while (commit_transaction->t_updates) {
 363                 DEFINE_WAIT(wait);
 364
 365                 prepare_to_wait(&journal->j_wait_updates, &wait,
 366                                         TASK_UNINTERRUPTIBLE);
 367                 if (commit_transaction->t_updates) {
 368                         spin_unlock(&commit_transaction->t_handle_lock);
 369                         spin_unlock(&journal->j_state_lock);
 370                         schedule();
 371                         spin_lock(&journal->j_state_lock);
 372                         spin_lock(&commit_transaction->t_handle_lock);
 373                 }
 374                 finish_wait(&journal->j_wait_updates, &wait);
 375         }
 376         spin_unlock(&commit_transaction->t_handle_lock);
 377
 378         J_ASSERT (commit_transaction->t_outstanding_credits <=
 379                         journal->j_max_transaction_buffers);
 380
 381         /*
 382          * First thing we are allowed to do is to discard any remaining
 383          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 384          * that there are no such buffers: if a large filesystem
 385          * operation like a truncate needs to split itself over multiple
 386          * transactions, then it may try to do a journal_restart() while
 387          * there are still BJ_Reserved buffers outstanding.  These must
 388          * be released cleanly from the current transaction.
 389          *
 390          * In this case, the filesystem must still reserve write access
 391          * again before modifying the buffer in the new transaction, but
 392          * we do not require it to remember exactly which old buffers it
 393          * has reserved.  This is consistent with the existing behaviour
 394          * that multiple journal_get_write_access() calls to the same
 395          * buffer are perfectly permissible.
 396          */
 397         while (commit_transaction->t_reserved_list) {
 398                 jh = commit_transaction->t_reserved_list;
 399                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 400                 /*
 401                  * A journal_get_undo_access()+journal_release_buffer() may
 402                  * leave undo-committed data.
 403                  */
 404                 if (jh->b_committed_data) {
 405                         struct buffer_head *bh = jh2bh(jh);
 406
 407                         jbd_lock_bh_state(bh);
 408                         jbd_free(jh->b_committed_data, bh->b_size);
 409                         jh->b_committed_data = NULL;
 410                         jbd_unlock_bh_state(bh);
 411                 }
 412                 journal_refile_buffer(journal, jh);
 413         }
 414
 415         /*
 416          * Now try to drop any written-back buffers from the journal's
 417          * checkpoint lists.  We do this *before* commit because it potentially
 418          * frees some memory
 419          */
 420         spin_lock(&journal->j_list_lock);
 421         __journal_clean_checkpoint_list(journal);
 422         spin_unlock(&journal->j_list_lock);
 423
 424         jbd_debug (3, "JBD: commit phase 1\n");
 425
 426         /*
 427          * Switch to a new revoke table.
 428          */
 429         journal_switch_revoke_table(journal);
 430
 431         commit_transaction->t_state = T_FLUSH;
 432         journal->j_committing_transaction = commit_transaction;
 433         journal->j_running_transaction = NULL;
 434         start_time = ktime_get();
 435         commit_transaction->t_log_start = journal->j_head;
 436         wake_up(&journal->j_wait_transaction_locked);
 437         spin_unlock(&journal->j_state_lock);
 438
 439         jbd_debug (3, "JBD: commit phase 2\n");
 440
 441         /*
 442          * Now start flushing things to disk, in the order they appear
 443          * on the transaction lists.  Data blocks go first.
 444          */
 445         err = journal_submit_data_buffers(journal, commit_transaction,
 446                                           write_op);
 447
 448         /*
 449          * Wait for all previously submitted IO to complete.
 450          */
 451         spin_lock(&journal->j_list_lock);
 452         while (commit_transaction->t_locked_list) {
 453                 struct buffer_head *bh;
 454
 455                 jh = commit_transaction->t_locked_list->b_tprev;
 456                 bh = jh2bh(jh);
 457                 get_bh(bh);
 458                 if (buffer_locked(bh)) {
 459                         spin_unlock(&journal->j_list_lock);
 460                         wait_on_buffer(bh);
 461                         spin_lock(&journal->j_list_lock);
 462                 }
 463                 if (unlikely(!buffer_uptodate(bh))) {
 464                         if (!trylock_page(bh->b_page)) {
 465                                 spin_unlock(&journal->j_list_lock);
 466                                 lock_page(bh->b_page);
 467                                 spin_lock(&journal->j_list_lock);
 468                         }
 469                         if (bh->b_page->mapping)
 470                                 set_bit(AS_EIO, &bh->b_page->mapping->flags);
 471
 472                         unlock_page(bh->b_page);
 473                         SetPageError(bh->b_page);
 474                         err = -EIO;
 475                 }
 476                 if (!inverted_lock(journal, bh)) {
 477                         put_bh(bh);
 478                         spin_lock(&journal->j_list_lock);
 479                         continue;
 480                 }
 481                 if (buffer_jbd(bh) && bh2jh(bh) == jh &&
 482                     jh->b_transaction == commit_transaction &&
 483                     jh->b_jlist == BJ_Locked) {
 484                         __journal_unfile_buffer(jh);
 485                         jbd_unlock_bh_state(bh);
 486                         journal_remove_journal_head(bh);
 487                         put_bh(bh);
 488                 } else {
 489                         jbd_unlock_bh_state(bh);
 490                 }
 491                 release_data_buffer(bh);
 492                 cond_resched_lock(&journal->j_list_lock);
 493         }
 494         spin_unlock(&journal->j_list_lock);
 495
 496         if (err) {
 497                 char b[BDEVNAME_SIZE];
 498
 499                 printk(KERN_WARNING
 500                         "JBD: Detected IO errors while flushing file data "
 501                         "on %s\n", bdevname(journal->j_fs_dev, b));
 502                 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
 503                         journal_abort(journal, err);
 504                 err = 0;
 505         }
 506
 507         journal_write_revoke_records(journal, commit_transaction, write_op);
 508
 509         /*
 510          * If we found any dirty or locked buffers, then we should have
 511          * looped back up to the write_out_data label.  If there weren't
 512          * any then journal_clean_data_list should have wiped the list
 513          * clean by now, so check that it is in fact empty.
 514          */
 515         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 516
 517         jbd_debug (3, "JBD: commit phase 3\n");
 518
 519         /*
 520          * Way to go: we have now written out all of the data for a
 521          * transaction!  Now comes the tricky part: we need to write out
 522          * metadata.  Loop over the transaction's entire buffer list:
 523          */
 524         spin_lock(&journal->j_state_lock);
 525         commit_transaction->t_state = T_COMMIT;
 526         spin_unlock(&journal->j_state_lock);
 527
 528         J_ASSERT(commit_transaction->t_nr_buffers <=
 529                  commit_transaction->t_outstanding_credits);
 530
 531         descriptor = NULL;
 532         bufs = 0;
 533         while (commit_transaction->t_buffers) {
 534
 535                 /* Find the next buffer to be journaled... */
 536
 537                 jh = commit_transaction->t_buffers;
 538
 539                 /* If we're in abort mode, we just un-journal the buffer and
 540                    release it. */
 541
 542                 if (is_journal_aborted(journal)) {
 543                         clear_buffer_jbddirty(jh2bh(jh));
 544                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 545                         journal_refile_buffer(journal, jh);
 546                         /* If that was the last one, we need to clean up
 547                          * any descriptor buffers which may have been
 548                          * already allocated, even if we are now
 549                          * aborting. */
 550                         if (!commit_transaction->t_buffers)
 551                                 goto start_journal_io;
 552                         continue;
 553                 }
 554
 555                 /* Make sure we have a descriptor block in which to
 556                    record the metadata buffer. */
 557
 558                 if (!descriptor) {
 559                         struct buffer_head *bh;
 560
 561                         J_ASSERT (bufs == 0);
 562
 563                         jbd_debug(4, "JBD: get descriptor\n");
 564
 565                         descriptor = journal_get_descriptor_buffer(journal);
 566                         if (!descriptor) {
 567                                 journal_abort(journal, -EIO);
 568                                 continue;
 569                         }
 570
 571                         bh = jh2bh(descriptor);
 572                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 573                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 574                         header = (journal_header_t *)&bh->b_data[0];
 575                         header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
 576                         header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
 577                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 578
 579                         tagp = &bh->b_data[sizeof(journal_header_t)];
 580                         space_left = bh->b_size - sizeof(journal_header_t);
 581                         first_tag = 1;
 582                         set_buffer_jwrite(bh);
 583                         set_buffer_dirty(bh);
 584                         wbuf[bufs++] = bh;
 585
 586                         /* Record it so that we can wait for IO
 587                            completion later */
 588                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 589                         journal_file_buffer(descriptor, commit_transaction,
 590                                         BJ_LogCtl);
 591                 }
 592
 593                 /* Where is the buffer to be written? */
 594
 595                 err = journal_next_log_block(journal, &blocknr);
 596                 /* If the block mapping failed, just abandon the buffer
 597                    and repeat this loop: we'll fall into the
 598                    refile-on-abort condition above. */
 599                 if (err) {
 600                         journal_abort(journal, err);
 601                         continue;
 602                 }
 603
 604                 /*
 605                  * start_this_handle() uses t_outstanding_credits to determine
 606                  * the free space in the log, but this counter is changed
 607                  * by journal_next_log_block() also.
 608                  */
 609                 commit_transaction->t_outstanding_credits--;
 610
 611                 /* Bump b_count to prevent truncate from stumbling over
 612                    the shadowed buffer!  @@@ This can go if we ever get
 613                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 614                 atomic_inc(&jh2bh(jh)->b_count);
 615
 616                 /* Make a temporary IO buffer with which to write it out
 617                    (this will requeue both the metadata buffer and the
 618                    temporary IO buffer). new_bh goes on BJ_IO*/
 619
 620                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 621                 /*
 622                  * akpm: journal_write_metadata_buffer() sets
 623                  * new_bh->b_transaction to commit_transaction.
 624                  * We need to clean this up before we release new_bh
 625                  * (which is of type BJ_IO)
 626                  */
 627                 JBUFFER_TRACE(jh, "ph3: write metadata");
 628                 flags = journal_write_metadata_buffer(commit_transaction,
 629                                                       jh, &new_jh, blocknr);
 630                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 631                 wbuf[bufs++] = jh2bh(new_jh);
 632
 633                 /* Record the new block's tag in the current descriptor
 634                    buffer */
 635
 636                 tag_flag = 0;
 637                 if (flags & 1)
 638                         tag_flag |= JFS_FLAG_ESCAPE;
 639                 if (!first_tag)
 640                         tag_flag |= JFS_FLAG_SAME_UUID;
 641
 642                 tag = (journal_block_tag_t *) tagp;
 643                 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
 644                 tag->t_flags = cpu_to_be32(tag_flag);
 645                 tagp += sizeof(journal_block_tag_t);
 646                 space_left -= sizeof(journal_block_tag_t);
 647
 648                 if (first_tag) {
 649                         memcpy (tagp, journal->j_uuid, 16);
 650                         tagp += 16;
 651                         space_left -= 16;
 652                         first_tag = 0;
 653                 }
 654
 655                 /* If there's no more to do, or if the descriptor is full,
 656                    let the IO rip! */
 657
 658                 if (bufs == journal->j_wbufsize ||
 659                     commit_transaction->t_buffers == NULL ||
 660                     space_left < sizeof(journal_block_tag_t) + 16) {
 661
 662                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 663
 664                         /* Write an end-of-descriptor marker before
 665                            submitting the IOs.  "tag" still points to
 666                            the last tag we set up. */
 667
 668                         tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
 669
 670 start_journal_io:
 671                         for (i = 0; i < bufs; i++) {
 672                                 struct buffer_head *bh = wbuf[i];
 673                                 lock_buffer(bh);
 674                                 clear_buffer_dirty(bh);
 675                                 set_buffer_uptodate(bh);
 676                                 bh->b_end_io = journal_end_buffer_io_sync;
 677                                 submit_bh(write_op, bh);
 678                         }
 679                         cond_resched();
 680
 681                         /* Force a new descriptor to be generated next
 682                            time round the loop. */
 683                         descriptor = NULL;
 684                         bufs = 0;
 685                 }
 686         }
 687
 688         /* Lo and behold: we have just managed to send a transaction to
 689            the log.  Before we can commit it, wait for the IO so far to
 690            complete.  Control buffers being written are on the
 691            transaction's t_log_list queue, and metadata buffers are on
 692            the t_iobuf_list queue.
 693
 694            Wait for the buffers in reverse order.  That way we are
 695            less likely to be woken up until all IOs have completed, and
 696            so we incur less scheduling load.
 697         */
 698
 699         jbd_debug(3, "JBD: commit phase 4\n");
 700
 701         /*
 702          * akpm: these are BJ_IO, and j_list_lock is not needed.
 703          * See __journal_try_to_free_buffer.
 704          */
 705 wait_for_iobuf:
 706         while (commit_transaction->t_iobuf_list != NULL) {
 707                 struct buffer_head *bh;
 708
 709                 jh = commit_transaction->t_iobuf_list->b_tprev;
 710                 bh = jh2bh(jh);
 711                 if (buffer_locked(bh)) {
 712                         wait_on_buffer(bh);
 713                         goto wait_for_iobuf;
 714                 }
 715                 if (cond_resched())
 716                         goto wait_for_iobuf;
 717
 718                 if (unlikely(!buffer_uptodate(bh)))
 719                         err = -EIO;
 720
 721                 clear_buffer_jwrite(bh);
 722
 723                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 724                 journal_unfile_buffer(journal, jh);
 725
 726                 /*
 727                  * ->t_iobuf_list should contain only dummy buffer_heads
 728                  * which were created by journal_write_metadata_buffer().
 729                  */
 730                 BUFFER_TRACE(bh, "dumping temporary bh");
 731                 journal_put_journal_head(jh);
 732                 __brelse(bh);
 733                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 734                 free_buffer_head(bh);
 735
 736                 /* We also have to unlock and free the corresponding
 737                    shadowed buffer */
 738                 jh = commit_transaction->t_shadow_list->b_tprev;
 739                 bh = jh2bh(jh);
 740                 clear_bit(BH_JWrite, &bh->b_state);
 741                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 742
 743                 /* The metadata is now released for reuse, but we need
 744                    to remember it against this transaction so that when
 745                    we finally commit, we can do any checkpointing
 746                    required. */
 747                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 748                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
 749                 /* Wake up any transactions which were waiting for this
 750                    IO to complete */
 751                 wake_up_bit(&bh->b_state, BH_Unshadow);
 752                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 753                 __brelse(bh);
 754         }
 755
 756         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 757
 758         jbd_debug(3, "JBD: commit phase 5\n");
 759
 760         /* Here we wait for the revoke record and descriptor record buffers */
 761  wait_for_ctlbuf:
 762         while (commit_transaction->t_log_list != NULL) {
 763                 struct buffer_head *bh;
 764
 765                 jh = commit_transaction->t_log_list->b_tprev;
 766                 bh = jh2bh(jh);
 767                 if (buffer_locked(bh)) {
 768                         wait_on_buffer(bh);
 769                         goto wait_for_ctlbuf;
 770                 }
 771                 if (cond_resched())
 772                         goto wait_for_ctlbuf;
 773
 774                 if (unlikely(!buffer_uptodate(bh)))
 775                         err = -EIO;
 776
 777                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 778                 clear_buffer_jwrite(bh);
 779                 journal_unfile_buffer(journal, jh);
 780                 journal_put_journal_head(jh);
 781                 __brelse(bh);           /* One for getblk */
 782                 /* AKPM: bforget here */
 783         }
 784
 785         if (err)
 786                 journal_abort(journal, err);
 787
 788         jbd_debug(3, "JBD: commit phase 6\n");
 789
 790         /* All metadata is written, now write commit record and do cleanup */
 791         spin_lock(&journal->j_state_lock);
 792         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 793         commit_transaction->t_state = T_COMMIT_RECORD;
 794         spin_unlock(&journal->j_state_lock);
 795
 796         if (journal_write_commit_record(journal, commit_transaction))
 797                 err = -EIO;
 798
 799         if (err)
 800                 journal_abort(journal, err);
 801
 802         /* End of a transaction!  Finally, we can do checkpoint
 803            processing: any buffers committed as a result of this
 804            transaction can be removed from any checkpoint list it was on
 805            before. */
 806
 807         jbd_debug(3, "JBD: commit phase 7\n");
 808
 809         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 810         J_ASSERT(commit_transaction->t_buffers == NULL);
 811         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 812         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 813         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 814         J_ASSERT(commit_transaction->t_log_list == NULL);
 815
 816 restart_loop:
 817         /*
 818          * As there are other places (journal_unmap_buffer()) adding buffers
 819          * to this list we have to be careful and hold the j_list_lock.
 820          */
 821         spin_lock(&journal->j_list_lock);
 822         while (commit_transaction->t_forget) {
 823                 transaction_t *cp_transaction;
 824                 struct buffer_head *bh;
 825
 826                 jh = commit_transaction->t_forget;
 827                 spin_unlock(&journal->j_list_lock);
 828                 bh = jh2bh(jh);
 829                 jbd_lock_bh_state(bh);
 830                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 831                         jh->b_transaction == journal->j_running_transaction);
 832
 833                 /*
 834                  * If there is undo-protected committed data against
 835                  * this buffer, then we can remove it now.  If it is a
 836                  * buffer needing such protection, the old frozen_data
 837                  * field now points to a committed version of the
 838                  * buffer, so rotate that field to the new committed
 839                  * data.
 840                  *
 841                  * Otherwise, we can just throw away the frozen data now.
 842                  */
 843                 if (jh->b_committed_data) {
 844                         jbd_free(jh->b_committed_data, bh->b_size);
 845                         jh->b_committed_data = NULL;
 846                         if (jh->b_frozen_data) {
 847                                 jh->b_committed_data = jh->b_frozen_data;
 848                                 jh->b_frozen_data = NULL;
 849                         }
 850                 } else if (jh->b_frozen_data) {
 851                         jbd_free(jh->b_frozen_data, bh->b_size);
 852                         jh->b_frozen_data = NULL;
 853                 }
 854
 855                 spin_lock(&journal->j_list_lock);
 856                 cp_transaction = jh->b_cp_transaction;
 857                 if (cp_transaction) {
 858                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 859                         __journal_remove_checkpoint(jh);
 860                 }
 861
 862                 /* Only re-checkpoint the buffer_head if it is marked
 863                  * dirty.  If the buffer was added to the BJ_Forget list
 864                  * by journal_forget, it may no longer be dirty and
 865                  * there's no point in keeping a checkpoint record for
 866                  * it. */
 867
 868                 /* A buffer which has been freed while still being
 869                  * journaled by a previous transaction may end up still
 870                  * being dirty here, but we want to avoid writing back
 871                  * that buffer in the future after the "add to orphan"
 872                  * operation has been committed.  That's not only a performance
 873                  * gain, it also stops aliasing problems if the buffer is
 874                  * left behind for writeback and gets reallocated for another
 875                  * use in a different page. */
 876                 if (buffer_freed(bh) && !jh->b_next_transaction) {
 877                         clear_buffer_freed(bh);
 878                         clear_buffer_jbddirty(bh);
 879                 }
 880
 881                 if (buffer_jbddirty(bh)) {
 882                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 883                         __journal_insert_checkpoint(jh, commit_transaction);
 884                         if (is_journal_aborted(journal))
 885                                 clear_buffer_jbddirty(bh);
 886                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 887                         __journal_refile_buffer(jh);
 888                         jbd_unlock_bh_state(bh);
 889                 } else {
 890                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 891                         /* The buffer on BJ_Forget list and not jbddirty means
 892                          * it has been freed by this transaction and hence it
 893                          * could not have been reallocated until this
 894                          * transaction has committed. *BUT* it could be
 895                          * reallocated once we have written all the data to
 896                          * disk and before we process the buffer on BJ_Forget
 897                          * list. */
 898                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 899                         __journal_refile_buffer(jh);
 900                         if (!jh->b_transaction) {
 901                                 jbd_unlock_bh_state(bh);
 902                                  /* needs a brelse */
 903                                 journal_remove_journal_head(bh);
 904                                 release_buffer_page(bh);
 905                         } else
 906                                 jbd_unlock_bh_state(bh);
 907                 }
 908                 cond_resched_lock(&journal->j_list_lock);
 909         }
 910         spin_unlock(&journal->j_list_lock);
 911         /*
 912          * This is a bit sleazy.  We use j_list_lock to protect transition
 913          * of a transaction into T_FINISHED state and calling
 914          * __journal_drop_transaction(). Otherwise we could race with
 915          * other checkpointing code processing the transaction...
 916          */
 917         spin_lock(&journal->j_state_lock);
 918         spin_lock(&journal->j_list_lock);
 919         /*
 920          * Now recheck if some buffers did not get attached to the transaction
 921          * while the lock was dropped...
 922          */
 923         if (commit_transaction->t_forget) {
 924                 spin_unlock(&journal->j_list_lock);
 925                 spin_unlock(&journal->j_state_lock);
 926                 goto restart_loop;
 927         }
 928
 929         /* Done with this transaction! */
 930
 931         jbd_debug(3, "JBD: commit phase 8\n");
 932
 933         J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
 934
 935         commit_transaction->t_state = T_FINISHED;
 936         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 937         journal->j_commit_sequence = commit_transaction->t_tid;
 938         journal->j_committing_transaction = NULL;
 939         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 940
 941         /*
 942          * weight the commit time higher than the average time so we don't
 943          * react too strongly to vast changes in commit time
 944          */
 945         if (likely(journal->j_average_commit_time))
 946                 journal->j_average_commit_time = (commit_time*3 +
 947                                 journal->j_average_commit_time) / 4;
 948         else
 949                 journal->j_average_commit_time = commit_time;
 950
 951         spin_unlock(&journal->j_state_lock);
 952
 953         if (commit_transaction->t_checkpoint_list == NULL &&
 954             commit_transaction->t_checkpoint_io_list == NULL) {
 955                 __journal_drop_transaction(journal, commit_transaction);
 956         } else {
 957                 if (journal->j_checkpoint_transactions == NULL) {
 958                         journal->j_checkpoint_transactions = commit_transaction;
 959                         commit_transaction->t_cpnext = commit_transaction;
 960                         commit_transaction->t_cpprev = commit_transaction;
 961                 } else {
 962                         commit_transaction->t_cpnext =
 963                                 journal->j_checkpoint_transactions;
 964                         commit_transaction->t_cpprev =
 965                                 commit_transaction->t_cpnext->t_cpprev;
 966                         commit_transaction->t_cpnext->t_cpprev =
 967                                 commit_transaction;
 968                         commit_transaction->t_cpprev->t_cpnext =
 969                                 commit_transaction;
 970                 }
 971         }
 972         spin_unlock(&journal->j_list_lock);
 973
 974         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 975                   journal->j_commit_sequence, journal->j_tail_sequence);
 976
 977         wake_up(&journal->j_wait_done_commit);
 978 }