fs/jbd/commit.c

   1 /*
   2  * linux/fs/jbd/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/bio.h>
  24
  25 /*
  26  * Default IO end handler for temporary BJ_IO buffer_heads.
  27  */
  28 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  29 {
  30         BUFFER_TRACE(bh, "");
  31         if (uptodate)
  32                 set_buffer_uptodate(bh);
  33         else
  34                 clear_buffer_uptodate(bh);
  35         unlock_buffer(bh);
  36 }
  37
  38 /*
  39  * When an ext3-ordered file is truncated, it is possible that many pages are
  40  * not successfully freed, because they are attached to a committing transaction.
  41  * After the transaction commits, these pages are left on the LRU, with no
  42  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  43  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  44  * the numbers in /proc/meminfo look odd.
  45  *
  46  * So here, we have a buffer which has just come off the forget list.  Look to
  47  * see if we can strip all buffers from the backing page.
  48  *
  49  * Called under journal->j_list_lock.  The caller provided us with a ref
  50  * against the buffer, and we drop that here.
  51  */
  52 static void release_buffer_page(struct buffer_head *bh)
  53 {
  54         struct page *page;
  55
  56         if (buffer_dirty(bh))
  57                 goto nope;
  58         if (atomic_read(&bh->b_count) != 1)
  59                 goto nope;
  60         page = bh->b_page;
  61         if (!page)
  62                 goto nope;
  63         if (page->mapping)
  64                 goto nope;
  65
  66         /* OK, it's a truncated page */
  67         if (!trylock_page(page))
  68                 goto nope;
  69
  70         page_cache_get(page);
  71         __brelse(bh);
  72         try_to_free_buffers(page);
  73         unlock_page(page);
  74         page_cache_release(page);
  75         return;
  76
  77 nope:
  78         __brelse(bh);
  79 }
  80
  81 /*
  82  * Decrement reference counter for data buffer. If it has been marked
  83  * 'BH_Freed', release it and the page to which it belongs if possible.
  84  */
  85 static void release_data_buffer(struct buffer_head *bh)
  86 {
  87         if (buffer_freed(bh)) {
  88                 clear_buffer_freed(bh);
  89                 release_buffer_page(bh);
  90         } else
  91                 put_bh(bh);
  92 }
  93
  94 /*
  95  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  96  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  97  * return 0.  j_list_lock is dropped in this case.
  98  */
  99 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
 100 {
 101         if (!jbd_trylock_bh_state(bh)) {
 102                 spin_unlock(&journal->j_list_lock);
 103                 schedule();
 104                 return 0;
 105         }
 106         return 1;
 107 }
 108
 109 /* Done it all: now write the commit record.  We should have
 110  * cleaned up our previous buffers by now, so if we are in abort
 111  * mode we can now just skip the rest of the journal write
 112  * entirely.
 113  *
 114  * Returns 1 if the journal needs to be aborted or 0 on success
 115  */
 116 static int journal_write_commit_record(journal_t *journal,
 117                                         transaction_t *commit_transaction)
 118 {
 119         struct journal_head *descriptor;
 120         struct buffer_head *bh;
 121         journal_header_t *header;
 122         int ret;
 123         int barrier_done = 0;
 124
 125         if (is_journal_aborted(journal))
 126                 return 0;
 127
 128         descriptor = journal_get_descriptor_buffer(journal);
 129         if (!descriptor)
 130                 return 1;
 131
 132         bh = jh2bh(descriptor);
 133
 134         header = (journal_header_t *)(bh->b_data);
 135         header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
 136         header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
 137         header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 138
 139         JBUFFER_TRACE(descriptor, "write commit block");
 140         set_buffer_dirty(bh);
 141         if (journal->j_flags & JFS_BARRIER) {
 142                 set_buffer_ordered(bh);
 143                 barrier_done = 1;
 144         }
 145         ret = sync_dirty_buffer(bh);
 146         if (barrier_done)
 147                 clear_buffer_ordered(bh);
 148         /* is it possible for another commit to fail at roughly
 149          * the same time as this one?  If so, we don't want to
 150          * trust the barrier flag in the super, but instead want
 151          * to remember if we sent a barrier request
 152          */
 153         if (ret == -EOPNOTSUPP && barrier_done) {
 154                 char b[BDEVNAME_SIZE];
 155
 156                 printk(KERN_WARNING
 157                         "JBD: barrier-based sync failed on %s - "
 158                         "disabling barriers\n",
 159                         bdevname(journal->j_dev, b));
 160                 spin_lock(&journal->j_state_lock);
 161                 journal->j_flags &= ~JFS_BARRIER;
 162                 spin_unlock(&journal->j_state_lock);
 163
 164                 /* And try again, without the barrier */
 165                 set_buffer_uptodate(bh);
 166                 set_buffer_dirty(bh);
 167                 ret = sync_dirty_buffer(bh);
 168         }
 169         put_bh(bh);             /* One for getblk() */
 170         journal_put_journal_head(descriptor);
 171
 172         return (ret == -EIO);
 173 }
 174
 175 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
 176                                    int write_op)
 177 {
 178         int i;
 179
 180         for (i = 0; i < bufs; i++) {
 181                 wbuf[i]->b_end_io = end_buffer_write_sync;
 182                 /* We use-up our safety reference in submit_bh() */
 183                 submit_bh(write_op, wbuf[i]);
 184         }
 185 }
 186
 187 /*
 188  *  Submit all the data buffers to disk
 189  */
 190 static int journal_submit_data_buffers(journal_t *journal,
 191                                        transaction_t *commit_transaction,
 192                                        int write_op)
 193 {
 194         struct journal_head *jh;
 195         struct buffer_head *bh;
 196         int locked;
 197         int bufs = 0;
 198         struct buffer_head **wbuf = journal->j_wbuf;
 199         int err = 0;
 200
 201         /*
 202          * Whenever we unlock the journal and sleep, things can get added
 203          * onto ->t_sync_datalist, so we have to keep looping back to
 204          * write_out_data until we *know* that the list is empty.
 205          *
 206          * Cleanup any flushed data buffers from the data list.  Even in
 207          * abort mode, we want to flush this out as soon as possible.
 208          */
 209 write_out_data:
 210         cond_resched();
 211         spin_lock(&journal->j_list_lock);
 212
 213         while (commit_transaction->t_sync_datalist) {
 214                 jh = commit_transaction->t_sync_datalist;
 215                 bh = jh2bh(jh);
 216                 locked = 0;
 217
 218                 /* Get reference just to make sure buffer does not disappear
 219                  * when we are forced to drop various locks */
 220                 get_bh(bh);
 221                 /* If the buffer is dirty, we need to submit IO and hence
 222                  * we need the buffer lock. We try to lock the buffer without
 223                  * blocking. If we fail, we need to drop j_list_lock and do
 224                  * blocking lock_buffer().
 225                  */
 226                 if (buffer_dirty(bh)) {
 227                         if (!trylock_buffer(bh)) {
 228                                 BUFFER_TRACE(bh, "needs blocking lock");
 229                                 spin_unlock(&journal->j_list_lock);
 230                                 /* Write out all data to prevent deadlocks */
 231                                 journal_do_submit_data(wbuf, bufs, write_op);
 232                                 bufs = 0;
 233                                 lock_buffer(bh);
 234                                 spin_lock(&journal->j_list_lock);
 235                         }
 236                         locked = 1;
 237                 }
 238                 /* We have to get bh_state lock. Again out of order, sigh. */
 239                 if (!inverted_lock(journal, bh)) {
 240                         jbd_lock_bh_state(bh);
 241                         spin_lock(&journal->j_list_lock);
 242                 }
 243                 /* Someone already cleaned up the buffer? */
 244                 if (!buffer_jbd(bh)
 245                         || jh->b_transaction != commit_transaction
 246                         || jh->b_jlist != BJ_SyncData) {
 247                         jbd_unlock_bh_state(bh);
 248                         if (locked)
 249                                 unlock_buffer(bh);
 250                         BUFFER_TRACE(bh, "already cleaned up");
 251                         release_data_buffer(bh);
 252                         continue;
 253                 }
 254                 if (locked && test_clear_buffer_dirty(bh)) {
 255                         BUFFER_TRACE(bh, "needs writeout, adding to array");
 256                         wbuf[bufs++] = bh;
 257                         __journal_file_buffer(jh, commit_transaction,
 258                                                 BJ_Locked);
 259                         jbd_unlock_bh_state(bh);
 260                         if (bufs == journal->j_wbufsize) {
 261                                 spin_unlock(&journal->j_list_lock);
 262                                 journal_do_submit_data(wbuf, bufs, write_op);
 263                                 bufs = 0;
 264                                 goto write_out_data;
 265                         }
 266                 } else if (!locked && buffer_locked(bh)) {
 267                         __journal_file_buffer(jh, commit_transaction,
 268                                                 BJ_Locked);
 269                         jbd_unlock_bh_state(bh);
 270                         put_bh(bh);
 271                 } else {
 272                         BUFFER_TRACE(bh, "writeout complete: unfile");
 273                         if (unlikely(!buffer_uptodate(bh)))
 274                                 err = -EIO;
 275                         __journal_unfile_buffer(jh);
 276                         jbd_unlock_bh_state(bh);
 277                         if (locked)
 278                                 unlock_buffer(bh);
 279                         journal_remove_journal_head(bh);
 280                         /* One for our safety reference, other for
 281                          * journal_remove_journal_head() */
 282                         put_bh(bh);
 283                         release_data_buffer(bh);
 284                 }
 285
 286                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 287                         spin_unlock(&journal->j_list_lock);
 288                         goto write_out_data;
 289                 }
 290         }
 291         spin_unlock(&journal->j_list_lock);
 292         journal_do_submit_data(wbuf, bufs, write_op);
 293
 294         return err;
 295 }
 296
 297 /*
 298  * journal_commit_transaction
 299  *
 300  * The primary function for committing a transaction to the log.  This
 301  * function is called by the journal thread to begin a complete commit.
 302  */
 303 void journal_commit_transaction(journal_t *journal)
 304 {
 305         transaction_t *commit_transaction;
 306         struct journal_head *jh, *new_jh, *descriptor;
 307         struct buffer_head **wbuf = journal->j_wbuf;
 308         int bufs;
 309         int flags;
 310         int err;
 311         unsigned long blocknr;
 312         ktime_t start_time;
 313         u64 commit_time;
 314         char *tagp = NULL;
 315         journal_header_t *header;
 316         journal_block_tag_t *tag = NULL;
 317         int space_left = 0;
 318         int first_tag = 0;
 319         int tag_flag;
 320         int i;
 321         int write_op = WRITE;
 322
 323         /*
 324          * First job: lock down the current transaction and wait for
 325          * all outstanding updates to complete.
 326          */
 327
 328 #ifdef COMMIT_STATS
 329         spin_lock(&journal->j_list_lock);
 330         summarise_journal_usage(journal);
 331         spin_unlock(&journal->j_list_lock);
 332 #endif
 333
 334         /* Do we need to erase the effects of a prior journal_flush? */
 335         if (journal->j_flags & JFS_FLUSHED) {
 336                 jbd_debug(3, "super block updated\n");
 337                 journal_update_superblock(journal, 1);
 338         } else {
 339                 jbd_debug(3, "superblock not updated\n");
 340         }
 341
 342         J_ASSERT(journal->j_running_transaction != NULL);
 343         J_ASSERT(journal->j_committing_transaction == NULL);
 344
 345         commit_transaction = journal->j_running_transaction;
 346         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 347
 348         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 349                         commit_transaction->t_tid);
 350
 351         spin_lock(&journal->j_state_lock);
 352         commit_transaction->t_state = T_LOCKED;
 353
 354         if (commit_transaction->t_synchronous_commit)
 355                 write_op = WRITE_SYNC;
 356         spin_lock(&commit_transaction->t_handle_lock);
 357         while (commit_transaction->t_updates) {
 358                 DEFINE_WAIT(wait);
 359
 360                 prepare_to_wait(&journal->j_wait_updates, &wait,
 361                                         TASK_UNINTERRUPTIBLE);
 362                 if (commit_transaction->t_updates) {
 363                         spin_unlock(&commit_transaction->t_handle_lock);
 364                         spin_unlock(&journal->j_state_lock);
 365                         schedule();
 366                         spin_lock(&journal->j_state_lock);
 367                         spin_lock(&commit_transaction->t_handle_lock);
 368                 }
 369                 finish_wait(&journal->j_wait_updates, &wait);
 370         }
 371         spin_unlock(&commit_transaction->t_handle_lock);
 372
 373         J_ASSERT (commit_transaction->t_outstanding_credits <=
 374                         journal->j_max_transaction_buffers);
 375
 376         /*
 377          * First thing we are allowed to do is to discard any remaining
 378          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 379          * that there are no such buffers: if a large filesystem
 380          * operation like a truncate needs to split itself over multiple
 381          * transactions, then it may try to do a journal_restart() while
 382          * there are still BJ_Reserved buffers outstanding.  These must
 383          * be released cleanly from the current transaction.
 384          *
 385          * In this case, the filesystem must still reserve write access
 386          * again before modifying the buffer in the new transaction, but
 387          * we do not require it to remember exactly which old buffers it
 388          * has reserved.  This is consistent with the existing behaviour
 389          * that multiple journal_get_write_access() calls to the same
  390          * buffer are perfectly permissible.
 391          */
 392         while (commit_transaction->t_reserved_list) {
 393                 jh = commit_transaction->t_reserved_list;
 394                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 395                 /*
 396                  * A journal_get_undo_access()+journal_release_buffer() may
 397                  * leave undo-committed data.
 398                  */
 399                 if (jh->b_committed_data) {
 400                         struct buffer_head *bh = jh2bh(jh);
 401
 402                         jbd_lock_bh_state(bh);
 403                         jbd_free(jh->b_committed_data, bh->b_size);
 404                         jh->b_committed_data = NULL;
 405                         jbd_unlock_bh_state(bh);
 406                 }
 407                 journal_refile_buffer(journal, jh);
 408         }
 409
 410         /*
 411          * Now try to drop any written-back buffers from the journal's
 412          * checkpoint lists.  We do this *before* commit because it potentially
 413          * frees some memory
 414          */
 415         spin_lock(&journal->j_list_lock);
 416         __journal_clean_checkpoint_list(journal);
 417         spin_unlock(&journal->j_list_lock);
 418
 419         jbd_debug (3, "JBD: commit phase 1\n");
 420
 421         /*
 422          * Switch to a new revoke table.
 423          */
 424         journal_switch_revoke_table(journal);
 425
 426         commit_transaction->t_state = T_FLUSH;
 427         journal->j_committing_transaction = commit_transaction;
 428         journal->j_running_transaction = NULL;
 429         start_time = ktime_get();
 430         commit_transaction->t_log_start = journal->j_head;
 431         wake_up(&journal->j_wait_transaction_locked);
 432         spin_unlock(&journal->j_state_lock);
 433
 434         jbd_debug (3, "JBD: commit phase 2\n");
 435
 436         /*
 437          * Now start flushing things to disk, in the order they appear
 438          * on the transaction lists.  Data blocks go first.
 439          */
 440         err = journal_submit_data_buffers(journal, commit_transaction,
 441                                           write_op);
 442
 443         /*
 444          * Wait for all previously submitted IO to complete.
 445          */
 446         spin_lock(&journal->j_list_lock);
 447         while (commit_transaction->t_locked_list) {
 448                 struct buffer_head *bh;
 449
 450                 jh = commit_transaction->t_locked_list->b_tprev;
 451                 bh = jh2bh(jh);
 452                 get_bh(bh);
 453                 if (buffer_locked(bh)) {
 454                         spin_unlock(&journal->j_list_lock);
 455                         wait_on_buffer(bh);
 456                         spin_lock(&journal->j_list_lock);
 457                 }
 458                 if (unlikely(!buffer_uptodate(bh))) {
 459                         if (!trylock_page(bh->b_page)) {
 460                                 spin_unlock(&journal->j_list_lock);
 461                                 lock_page(bh->b_page);
 462                                 spin_lock(&journal->j_list_lock);
 463                         }
 464                         if (bh->b_page->mapping)
 465                                 set_bit(AS_EIO, &bh->b_page->mapping->flags);
 466
 467                         unlock_page(bh->b_page);
 468                         SetPageError(bh->b_page);
 469                         err = -EIO;
 470                 }
 471                 if (!inverted_lock(journal, bh)) {
 472                         put_bh(bh);
 473                         spin_lock(&journal->j_list_lock);
 474                         continue;
 475                 }
 476                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 477                         __journal_unfile_buffer(jh);
 478                         jbd_unlock_bh_state(bh);
 479                         journal_remove_journal_head(bh);
 480                         put_bh(bh);
 481                 } else {
 482                         jbd_unlock_bh_state(bh);
 483                 }
 484                 release_data_buffer(bh);
 485                 cond_resched_lock(&journal->j_list_lock);
 486         }
 487         spin_unlock(&journal->j_list_lock);
 488
 489         if (err) {
 490                 char b[BDEVNAME_SIZE];
 491
 492                 printk(KERN_WARNING
 493                         "JBD: Detected IO errors while flushing file data "
 494                         "on %s\n", bdevname(journal->j_fs_dev, b));
 495                 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
 496                         journal_abort(journal, err);
 497                 err = 0;
 498         }
 499
 500         journal_write_revoke_records(journal, commit_transaction);
 501
 502         /*
 503          * If we found any dirty or locked buffers, then we should have
 504          * looped back up to the write_out_data label.  If there weren't
 505          * any then journal_clean_data_list should have wiped the list
 506          * clean by now, so check that it is in fact empty.
 507          */
 508         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 509
 510         jbd_debug (3, "JBD: commit phase 3\n");
 511
 512         /*
 513          * Way to go: we have now written out all of the data for a
 514          * transaction!  Now comes the tricky part: we need to write out
 515          * metadata.  Loop over the transaction's entire buffer list:
 516          */
 517         spin_lock(&journal->j_state_lock);
 518         commit_transaction->t_state = T_COMMIT;
 519         spin_unlock(&journal->j_state_lock);
 520
 521         J_ASSERT(commit_transaction->t_nr_buffers <=
 522                  commit_transaction->t_outstanding_credits);
 523
 524         descriptor = NULL;
 525         bufs = 0;
 526         while (commit_transaction->t_buffers) {
 527
 528                 /* Find the next buffer to be journaled... */
 529
 530                 jh = commit_transaction->t_buffers;
 531
 532                 /* If we're in abort mode, we just un-journal the buffer and
 533                    release it. */
 534
 535                 if (is_journal_aborted(journal)) {
 536                         clear_buffer_jbddirty(jh2bh(jh));
 537                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 538                         journal_refile_buffer(journal, jh);
 539                         /* If that was the last one, we need to clean up
 540                          * any descriptor buffers which may have been
 541                          * already allocated, even if we are now
 542                          * aborting. */
 543                         if (!commit_transaction->t_buffers)
 544                                 goto start_journal_io;
 545                         continue;
 546                 }
 547
 548                 /* Make sure we have a descriptor block in which to
 549                    record the metadata buffer. */
 550
 551                 if (!descriptor) {
 552                         struct buffer_head *bh;
 553
 554                         J_ASSERT (bufs == 0);
 555
 556                         jbd_debug(4, "JBD: get descriptor\n");
 557
 558                         descriptor = journal_get_descriptor_buffer(journal);
 559                         if (!descriptor) {
 560                                 journal_abort(journal, -EIO);
 561                                 continue;
 562                         }
 563
 564                         bh = jh2bh(descriptor);
 565                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 566                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 567                         header = (journal_header_t *)&bh->b_data[0];
 568                         header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
 569                         header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
 570                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 571
 572                         tagp = &bh->b_data[sizeof(journal_header_t)];
 573                         space_left = bh->b_size - sizeof(journal_header_t);
 574                         first_tag = 1;
 575                         set_buffer_jwrite(bh);
 576                         set_buffer_dirty(bh);
 577                         wbuf[bufs++] = bh;
 578
 579                         /* Record it so that we can wait for IO
 580                            completion later */
 581                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 582                         journal_file_buffer(descriptor, commit_transaction,
 583                                         BJ_LogCtl);
 584                 }
 585
 586                 /* Where is the buffer to be written? */
 587
 588                 err = journal_next_log_block(journal, &blocknr);
 589                 /* If the block mapping failed, just abandon the buffer
 590                    and repeat this loop: we'll fall into the
 591                    refile-on-abort condition above. */
 592                 if (err) {
 593                         journal_abort(journal, err);
 594                         continue;
 595                 }
 596
 597                 /*
 598                  * start_this_handle() uses t_outstanding_credits to determine
 599                  * the free space in the log, but this counter is changed
 600                  * by journal_next_log_block() also.
 601                  */
 602                 commit_transaction->t_outstanding_credits--;
 603
 604                 /* Bump b_count to prevent truncate from stumbling over
 605                    the shadowed buffer!  @@@ This can go if we ever get
 606                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 607                 atomic_inc(&jh2bh(jh)->b_count);
 608
 609                 /* Make a temporary IO buffer with which to write it out
 610                    (this will requeue both the metadata buffer and the
 611                    temporary IO buffer). new_bh goes on BJ_IO*/
 612
 613                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 614                 /*
 615                  * akpm: journal_write_metadata_buffer() sets
 616                  * new_bh->b_transaction to commit_transaction.
 617                  * We need to clean this up before we release new_bh
 618                  * (which is of type BJ_IO)
 619                  */
 620                 JBUFFER_TRACE(jh, "ph3: write metadata");
 621                 flags = journal_write_metadata_buffer(commit_transaction,
 622                                                       jh, &new_jh, blocknr);
 623                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 624                 wbuf[bufs++] = jh2bh(new_jh);
 625
 626                 /* Record the new block's tag in the current descriptor
 627                    buffer */
 628
 629                 tag_flag = 0;
 630                 if (flags & 1)
 631                         tag_flag |= JFS_FLAG_ESCAPE;
 632                 if (!first_tag)
 633                         tag_flag |= JFS_FLAG_SAME_UUID;
 634
 635                 tag = (journal_block_tag_t *) tagp;
 636                 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
 637                 tag->t_flags = cpu_to_be32(tag_flag);
 638                 tagp += sizeof(journal_block_tag_t);
 639                 space_left -= sizeof(journal_block_tag_t);
 640
 641                 if (first_tag) {
 642                         memcpy (tagp, journal->j_uuid, 16);
 643                         tagp += 16;
 644                         space_left -= 16;
 645                         first_tag = 0;
 646                 }
 647
 648                 /* If there's no more to do, or if the descriptor is full,
 649                    let the IO rip! */
 650
 651                 if (bufs == journal->j_wbufsize ||
 652                     commit_transaction->t_buffers == NULL ||
 653                     space_left < sizeof(journal_block_tag_t) + 16) {
 654
 655                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 656
 657                         /* Write an end-of-descriptor marker before
 658                            submitting the IOs.  "tag" still points to
 659                            the last tag we set up. */
 660
 661                         tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
 662
 663 start_journal_io:
 664                         for (i = 0; i < bufs; i++) {
 665                                 struct buffer_head *bh = wbuf[i];
 666                                 lock_buffer(bh);
 667                                 clear_buffer_dirty(bh);
 668                                 set_buffer_uptodate(bh);
 669                                 bh->b_end_io = journal_end_buffer_io_sync;
 670                                 submit_bh(write_op, bh);
 671                         }
 672                         cond_resched();
 673
 674                         /* Force a new descriptor to be generated next
 675                            time round the loop. */
 676                         descriptor = NULL;
 677                         bufs = 0;
 678                 }
 679         }
 680
 681         /* Lo and behold: we have just managed to send a transaction to
 682            the log.  Before we can commit it, wait for the IO so far to
 683            complete.  Control buffers being written are on the
 684            transaction's t_log_list queue, and metadata buffers are on
 685            the t_iobuf_list queue.
 686
 687            Wait for the buffers in reverse order.  That way we are
 688            less likely to be woken up until all IOs have completed, and
 689            so we incur less scheduling load.
 690         */
 691
 692         jbd_debug(3, "JBD: commit phase 4\n");
 693
 694         /*
 695          * akpm: these are BJ_IO, and j_list_lock is not needed.
 696          * See __journal_try_to_free_buffer.
 697          */
 698 wait_for_iobuf:
 699         while (commit_transaction->t_iobuf_list != NULL) {
 700                 struct buffer_head *bh;
 701
 702                 jh = commit_transaction->t_iobuf_list->b_tprev;
 703                 bh = jh2bh(jh);
 704                 if (buffer_locked(bh)) {
 705                         wait_on_buffer(bh);
 706                         goto wait_for_iobuf;
 707                 }
 708                 if (cond_resched())
 709                         goto wait_for_iobuf;
 710
 711                 if (unlikely(!buffer_uptodate(bh)))
 712                         err = -EIO;
 713
 714                 clear_buffer_jwrite(bh);
 715
 716                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 717                 journal_unfile_buffer(journal, jh);
 718
 719                 /*
 720                  * ->t_iobuf_list should contain only dummy buffer_heads
 721                  * which were created by journal_write_metadata_buffer().
 722                  */
 723                 BUFFER_TRACE(bh, "dumping temporary bh");
 724                 journal_put_journal_head(jh);
 725                 __brelse(bh);
 726                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 727                 free_buffer_head(bh);
 728
 729                 /* We also have to unlock and free the corresponding
 730                    shadowed buffer */
 731                 jh = commit_transaction->t_shadow_list->b_tprev;
 732                 bh = jh2bh(jh);
 733                 clear_bit(BH_JWrite, &bh->b_state);
 734                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 735
 736                 /* The metadata is now released for reuse, but we need
 737                    to remember it against this transaction so that when
 738                    we finally commit, we can do any checkpointing
 739                    required. */
 740                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 741                 journal_file_buffer(jh, commit_transaction, BJ_Forget);
 742                 /* Wake up any transactions which were waiting for this
 743                    IO to complete */
 744                 wake_up_bit(&bh->b_state, BH_Unshadow);
 745                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 746                 __brelse(bh);
 747         }
 748
 749         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 750
 751         jbd_debug(3, "JBD: commit phase 5\n");
 752
 753         /* Here we wait for the revoke record and descriptor record buffers */
 754  wait_for_ctlbuf:
 755         while (commit_transaction->t_log_list != NULL) {
 756                 struct buffer_head *bh;
 757
 758                 jh = commit_transaction->t_log_list->b_tprev;
 759                 bh = jh2bh(jh);
 760                 if (buffer_locked(bh)) {
 761                         wait_on_buffer(bh);
 762                         goto wait_for_ctlbuf;
 763                 }
 764                 if (cond_resched())
 765                         goto wait_for_ctlbuf;
 766
 767                 if (unlikely(!buffer_uptodate(bh)))
 768                         err = -EIO;
 769
 770                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 771                 clear_buffer_jwrite(bh);
 772                 journal_unfile_buffer(journal, jh);
 773                 journal_put_journal_head(jh);
 774                 __brelse(bh);           /* One for getblk */
 775                 /* AKPM: bforget here */
 776         }
 777
 778         if (err)
 779                 journal_abort(journal, err);
 780
 781         jbd_debug(3, "JBD: commit phase 6\n");
 782
 783         if (journal_write_commit_record(journal, commit_transaction))
 784                 err = -EIO;
 785
 786         if (err)
 787                 journal_abort(journal, err);
 788
 789         /* End of a transaction!  Finally, we can do checkpoint
 790            processing: any buffers committed as a result of this
 791            transaction can be removed from any checkpoint lists they
 792            were on before. */
 793
 794         jbd_debug(3, "JBD: commit phase 7\n");
 795
 796         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 797         J_ASSERT(commit_transaction->t_buffers == NULL);
 798         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 799         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 800         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 801         J_ASSERT(commit_transaction->t_log_list == NULL);
 802
 803 restart_loop:
 804         /*
 805          * As there are other places (journal_unmap_buffer()) adding buffers
 806          * to this list we have to be careful and hold the j_list_lock.
 807          */
 808         spin_lock(&journal->j_list_lock);
 809         while (commit_transaction->t_forget) {
 810                 transaction_t *cp_transaction;
 811                 struct buffer_head *bh;
 812
 813                 jh = commit_transaction->t_forget;
 814                 spin_unlock(&journal->j_list_lock);
 815                 bh = jh2bh(jh);
 816                 jbd_lock_bh_state(bh);
 817                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 818                         jh->b_transaction == journal->j_running_transaction);
 819
 820                 /*
 821                  * If there is undo-protected committed data against
 822                  * this buffer, then we can remove it now.  If it is a
 823                  * buffer needing such protection, the old frozen_data
 824                  * field now points to a committed version of the
 825                  * buffer, so rotate that field to the new committed
 826                  * data.
 827                  *
 828                  * Otherwise, we can just throw away the frozen data now.
 829                  */
 830                 if (jh->b_committed_data) {
 831                         jbd_free(jh->b_committed_data, bh->b_size);
 832                         jh->b_committed_data = NULL;
 833                         if (jh->b_frozen_data) {
 834                                 jh->b_committed_data = jh->b_frozen_data;
 835                                 jh->b_frozen_data = NULL;
 836                         }
 837                 } else if (jh->b_frozen_data) {
 838                         jbd_free(jh->b_frozen_data, bh->b_size);
 839                         jh->b_frozen_data = NULL;
 840                 }
 841
 842                 spin_lock(&journal->j_list_lock);
 843                 cp_transaction = jh->b_cp_transaction;
 844                 if (cp_transaction) {
 845                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 846                         __journal_remove_checkpoint(jh);
 847                 }
 848
 849                 /* Only re-checkpoint the buffer_head if it is marked
 850                  * dirty.  If the buffer was added to the BJ_Forget list
 851                  * by journal_forget, it may no longer be dirty and
 852                  * there's no point in keeping a checkpoint record for
 853                  * it. */
 854
 855                 /* A buffer which has been freed while still being
 856                  * journaled by a previous transaction may end up still
 857                  * being dirty here, but we want to avoid writing back
 858                  * that buffer in the future now that the last use has
 859                  * been committed.  That's not only a performance gain,
 860                  * it also stops aliasing problems if the buffer is left
 861                  * behind for writeback and gets reallocated for another
 862                  * use in a different page. */
 863                 if (buffer_freed(bh)) {
 864                         clear_buffer_freed(bh);
 865                         clear_buffer_jbddirty(bh);
 866                 }
 867
 868                 if (buffer_jbddirty(bh)) {
 869                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 870                         __journal_insert_checkpoint(jh, commit_transaction);
 871                         if (is_journal_aborted(journal))
 872                                 clear_buffer_jbddirty(bh);
 873                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 874                         __journal_refile_buffer(jh);
 875                         jbd_unlock_bh_state(bh);
 876                 } else {
 877                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 878                         /* A buffer that is on the BJ_Forget list and is not
 879                          * jbddirty has been freed by this transaction, and hence
 880                          * could not have been reallocated until this
 881                          * transaction has committed. *BUT* it could be
 882                          * reallocated once we have written all the data to
 883                          * disk and before we process the buffer on the
 884                          * BJ_Forget list. */
 885                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 886                         __journal_refile_buffer(jh);
 887                         if (!jh->b_transaction) {
 888                                 jbd_unlock_bh_state(bh);
 889                                  /* needs a brelse */
 890                                 journal_remove_journal_head(bh);
 891                                 release_buffer_page(bh);
 892                         } else
 893                                 jbd_unlock_bh_state(bh);
 894                 }
 895                 cond_resched_lock(&journal->j_list_lock);
 896         }
 897         spin_unlock(&journal->j_list_lock);
 898         /*
 899          * This is a bit sleazy.  We use j_list_lock to protect transition
 900          * of a transaction into T_FINISHED state and calling
 901          * __journal_drop_transaction(). Otherwise we could race with
 902          * other checkpointing code processing the transaction...
 903          */
 904         spin_lock(&journal->j_state_lock);
 905         spin_lock(&journal->j_list_lock);
 906         /*
 907          * Now recheck if some buffers did not get attached to the transaction
 908          * while the lock was dropped...
 909          */
 910         if (commit_transaction->t_forget) {
 911                 spin_unlock(&journal->j_list_lock);
 912                 spin_unlock(&journal->j_state_lock);
 913                 goto restart_loop;
 914         }
 915
 916         /* Done with this transaction! */
 917
 918         jbd_debug(3, "JBD: commit phase 8\n");
 919
 920         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 921
 922         commit_transaction->t_state = T_FINISHED;
 923         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 924         journal->j_commit_sequence = commit_transaction->t_tid;
 925         journal->j_committing_transaction = NULL;
 926         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 927
 928         /*
 929          * weight the commit time higher than the average time so we don't
 930          * react too strongly to vast changes in commit time
 931          */
 932         if (likely(journal->j_average_commit_time))
 933                 journal->j_average_commit_time = (commit_time*3 +
 934                                 journal->j_average_commit_time) / 4;
 935         else
 936                 journal->j_average_commit_time = commit_time;
 937
 938         spin_unlock(&journal->j_state_lock);
 939
 940         if (commit_transaction->t_checkpoint_list == NULL &&
 941             commit_transaction->t_checkpoint_io_list == NULL) {
 942                 __journal_drop_transaction(journal, commit_transaction);
 943         } else {
 944                 if (journal->j_checkpoint_transactions == NULL) {
 945                         journal->j_checkpoint_transactions = commit_transaction;
 946                         commit_transaction->t_cpnext = commit_transaction;
 947                         commit_transaction->t_cpprev = commit_transaction;
 948                 } else {
 949                         commit_transaction->t_cpnext =
 950                                 journal->j_checkpoint_transactions;
 951                         commit_transaction->t_cpprev =
 952                                 commit_transaction->t_cpnext->t_cpprev;
 953                         commit_transaction->t_cpnext->t_cpprev =
 954                                 commit_transaction;
 955                         commit_transaction->t_cpprev->t_cpnext =
 956                                 commit_transaction;
 957                 }
 958         }
 959         spin_unlock(&journal->j_list_lock);
 960
 961         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 962                   journal->j_commit_sequence, journal->j_tail_sequence);
 963
 964         wake_up(&journal->j_wait_done_commit);
 965 }