fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25
  26 /*
  27  * I/O completion callback used for the temporary BJ_IO buffer_heads.
      *
      * Copies the outcome reported in @uptodate into the buffer's uptodate
      * bit and then unlocks the buffer.
  28  */
  29 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  30 {
  31         BUFFER_TRACE(bh, "");
  32         if (!uptodate)
  33                 clear_buffer_uptodate(bh);
  34         else
  35                 set_buffer_uptodate(bh);
  36         unlock_buffer(bh);
  37 }
  38
  39 /*
  40  * When an ext3-ordered file is truncated, it is possible that many pages are
  41  * not successfully freed, because they are attached to a committing transaction.
  42  * After the transaction commits, these pages are left on the LRU, with no
  43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  45  * the numbers in /proc/meminfo look odd.
  46  *
  47  * So here, we have a buffer which has just come off the forget list.  Look to
  48  * see if we can strip all buffers from the backing page.
  49  *
  50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  51  * caller provided us with a ref against the buffer, and we drop that here.
      *
      * The buffers are only stripped when @bh is clean, its b_count is exactly
      * 1, its page has no ->mapping, and TestSetPageLocked() reports that the
      * page was not already locked.  In every other case only the buffer
      * reference is dropped.
  52  */
  53 static void release_buffer_page(struct buffer_head *bh)
  54 {
  55         struct page *page;
  56
  57         if (buffer_dirty(bh))
  58                 goto nope;
             /* b_count != 1: the reference we were handed is not the only one */
  59         if (atomic_read(&bh->b_count) != 1)
  60                 goto nope;
  61         page = bh->b_page;
  62         if (!page)
  63                 goto nope;
  64         if (page->mapping)
  65                 goto nope;
  66
  67         /* OK, it's a truncated page */
  68         if (TestSetPageLocked(page))
  69                 goto nope;
  70
             /*
              * Pin the page before dropping the buffer reference; the pin is
              * released only after the page has been unlocked again.
              */
  71         page_cache_get(page);
  72         __brelse(bh);
  73         try_to_free_buffers(page);
  74         unlock_page(page);
  75         page_cache_release(page);
  76         return;
  77
  78 nope:
             /* Nothing to strip: just drop the buffer reference. */
  79         __brelse(bh);
  80 }
  81
  82 /*
  83  * Try to take jbd_lock_bh_state() on @bh while j_list_lock is already held.
  84  * For lock-ranking reasons only a trylock may be used here.
  85  *
      * Returns 1 if the bh_state lock was obtained; j_list_lock is still held.
      * Returns 0 if the trylock failed; in that case j_list_lock has been
      * dropped and schedule() has been called before returning.
  86  */
  87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
  88 {
  89         if (jbd_trylock_bh_state(bh))
  90                 return 1;
  91
  92         spin_unlock(&journal->j_list_lock);
  93         schedule();
  94         return 0;
  95 }
  96
  97 /*
  98  * Done it all: now submit the commit record.  We should have
  99  * cleaned up our previous buffers by now, so if we are in abort
 100  * mode we can now just skip the rest of the journal write
 101  * entirely.
 102  *
 103  * Returns 0 without submitting anything if the journal is aborted, 1 if
      * no descriptor buffer could be obtained, and otherwise whatever
      * submit_bh() returned for the commit block.  *cbh is set to the commit
      * buffer on every path that reaches submit_bh(); the two early returns
      * leave it untouched.
      *
      * @crc32_sum is stored in the commit header only when the journal has
      * JBD2_FEATURE_COMPAT_CHECKSUM set.
 104  */
 105 static int journal_submit_commit_record(journal_t *journal,
 106                                         transaction_t *commit_transaction,
 107                                         struct buffer_head **cbh,
 108                                         __u32 crc32_sum)
 109 {
 110         struct journal_head *descriptor;
 111         struct commit_header *tmp;
 112         struct buffer_head *bh;
 113         int ret;
 114         int barrier_done = 0;
 115
 116         if (is_journal_aborted(journal))
 117                 return 0;
 118
 119         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 120         if (!descriptor)
 121                 return 1;
 122
 123         bh = jh2bh(descriptor);
 124
             /* Build the commit block header in the descriptor buffer's data. */
 125         tmp = (struct commit_header *)bh->b_data;
 126         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 127         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 128         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 129
 130         if (JBD2_HAS_COMPAT_FEATURE(journal,
 131                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 132                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 133                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 134                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 135         }
 136
 137         JBUFFER_TRACE(descriptor, "submit commit block");
             /* Lock the buffer and take a reference before submitting it. */
 138         lock_buffer(bh);
 139         get_bh(bh);
 140         set_buffer_dirty(bh);
 141         set_buffer_uptodate(bh);
 142         bh->b_end_io = journal_end_buffer_io_sync;
 143
             /*
              * Mark the write as ordered only when barriers are enabled and the
              * journal is not using asynchronous commit.
              */
 144         if (journal->j_flags & JBD2_BARRIER &&
 145                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
 146                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 147                 set_buffer_ordered(bh);
 148                 barrier_done = 1;
 149         }
 150         ret = submit_bh(WRITE, bh);
 151         if (barrier_done)
 152                 clear_buffer_ordered(bh);
 153
 154         /* is it possible for another commit to fail at roughly
 155          * the same time as this one?  If so, we don't want to
 156          * trust the barrier flag in the super, but instead want
 157          * to remember if we sent a barrier request
 158          */
 159         if (ret == -EOPNOTSUPP && barrier_done) {
 160                 char b[BDEVNAME_SIZE];
 161
 162                 printk(KERN_WARNING
 163                         "JBD: barrier-based sync failed on %s - "
 164                         "disabling barriers\n",
 165                         bdevname(journal->j_dev, b));
                     /* j_flags is updated under j_state_lock. */
 166                 spin_lock(&journal->j_state_lock);
 167                 journal->j_flags &= ~JBD2_BARRIER;
 168                 spin_unlock(&journal->j_state_lock);
 169
 170                 /* And try again, without the barrier */
 171                 lock_buffer(bh);
 172                 set_buffer_uptodate(bh);
 173                 set_buffer_dirty(bh);
 174                 ret = submit_bh(WRITE, bh);
 175         }
 176         *cbh = bh;
 177         return ret;
 178 }
 179
 180 /*
 181  * Second half of the asynchronous commit-record write: waits for the buffer
 182  * submitted by journal_submit_commit_record() and releases it.
      *
      * Returns 0 if the buffer is uptodate once the wait finishes, -EIO
      * otherwise.  The buffer_head reference and the journal_head reference
      * are dropped in both cases.
 183  */
 184 static int journal_wait_on_commit_record(struct buffer_head *bh)
 185 {
 186         int status;
 187
 188         clear_buffer_dirty(bh);
 189         wait_on_buffer(bh);
 190
 191         status = unlikely(!buffer_uptodate(bh)) ? -EIO : 0;
 192
 193         put_bh(bh);            /* One for getblk() */
 194         jbd2_journal_put_journal_head(bh2jh(bh));
 195
 196         return status;
 197 }
 198
 199 /*
 200  * Wait for all submitted IO to complete.
      *
      * Drains commit_transaction->t_locked_list: each buffer is waited on if
      * it is still locked and then, provided it is still a jbd buffer filed as
      * BJ_Locked, it is unfiled and its journal_head is removed.
      *
      * Entered and exited with journal->j_list_lock held; the lock is dropped
      * and retaken internally.  Returns -EIO if any buffer that was waited on
      * is not uptodate afterwards, 0 otherwise.
 201  */
 202 static int journal_wait_on_locked_list(journal_t *journal,
 203                                        transaction_t *commit_transaction)
 204 {
 205         int ret = 0;
 206         struct journal_head *jh;
 207
 208         while (commit_transaction->t_locked_list) {
 209                 struct buffer_head *bh;
 210
 211                 jh = commit_transaction->t_locked_list->b_tprev;
 212                 bh = jh2bh(jh);
                     /* Hold bh while j_list_lock may be dropped below. */
 213                 get_bh(bh);
 214                 if (buffer_locked(bh)) {
 215                         spin_unlock(&journal->j_list_lock);
 216                         wait_on_buffer(bh);
 217                         if (unlikely(!buffer_uptodate(bh)))
 218                                 ret = -EIO;
 219                         spin_lock(&journal->j_list_lock);
 220                 }
                     /*
                      * On failure inverted_lock() has dropped j_list_lock: give
                      * back our reference, retake the lock and start over.
                      */
 221                 if (!inverted_lock(journal, bh)) {
 222                         put_bh(bh);
 223                         spin_lock(&journal->j_list_lock);
 224                         continue;
 225                 }
 226                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
 227                         __jbd2_journal_unfile_buffer(jh);
 228                         jbd_unlock_bh_state(bh);
 229                         jbd2_journal_remove_journal_head(bh);
                             /*
                              * NOTE(review): presumably pairs with
                              * jbd2_journal_remove_journal_head(), as in
                              * journal_submit_data_buffers() - confirm.
                              */
 230                         put_bh(bh);
 231                 } else {
 232                         jbd_unlock_bh_state(bh);
 233                 }
                     /* Drop the reference taken by get_bh() above. */
 234                 put_bh(bh);
 235                 cond_resched_lock(&journal->j_list_lock);
 236         }
 237         return ret;
 238   }
 239
     /*
      * Start write I/O on the first @bufs buffer_heads stored in @wbuf, using
      * end_buffer_write_sync as the completion handler for each of them.
      */
 240 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 241 {
 242         int idx;
 243
 244         for (idx = 0; idx < bufs; idx++) {
 245                 struct buffer_head *bh = wbuf[idx];

 246                 bh->b_end_io = end_buffer_write_sync;
                     /* We use-up our safety reference in submit_bh() */
 247                 submit_bh(WRITE, bh);
 248         }
 249 }
 250
 251 /*
 252  *  Submit all the data buffers to disk
      *
      *  Loops until commit_transaction->t_sync_datalist is empty.  Buffers
      *  that are no longer jbd buffers on this transaction's BJ_SyncData list
      *  are skipped.  For every other buffer:
      *   - locked by us and still dirty: the dirty bit is cleared, the buffer
      *     is queued in journal->j_wbuf and refiled as BJ_Locked;
      *   - not locked by us but currently locked: refiled as BJ_Locked;
      *   - otherwise: unfiled (and unlocked if we locked it) and its
      *     journal_head is removed.
      *  Queued buffers are handed to journal_do_submit_data() whenever j_wbuf
      *  holds j_wbufsize entries, before sleeping in lock_buffer(), and once
      *  more at the end.
      *
      *  Takes and releases journal->j_list_lock itself.
 253  */
 254 static void journal_submit_data_buffers(journal_t *journal,
 255                                 transaction_t *commit_transaction)
 256 {
 257         struct journal_head *jh;
 258         struct buffer_head *bh;
 259         int locked;
 260         int bufs = 0;
 261         struct buffer_head **wbuf = journal->j_wbuf;
 262
 263         /*
 264          * Whenever we unlock the journal and sleep, things can get added
 265          * onto ->t_sync_datalist, so we have to keep looping back to
 266          * write_out_data until we *know* that the list is empty.
 267          *
 268          * Cleanup any flushed data buffers from the data list.  Even in
 269          * abort mode, we want to flush this out as soon as possible.
 270          */
 271 write_out_data:
 272         cond_resched();
 273         spin_lock(&journal->j_list_lock);
 274
 275         while (commit_transaction->t_sync_datalist) {
 276                 jh = commit_transaction->t_sync_datalist;
 277                 bh = jh2bh(jh);
                     /* locked: set once we hold the buffer lock on bh */
 278                 locked = 0;
 279
 280                 /* Get reference just to make sure buffer does not disappear
 281                  * when we are forced to drop various locks */
 282                 get_bh(bh);
 283                 /* If the buffer is dirty, we need to submit IO and hence
 284                  * we need the buffer lock. We try to lock the buffer without
 285                  * blocking. If we fail, we need to drop j_list_lock and do
 286                  * blocking lock_buffer().
 287                  */
 288                 if (buffer_dirty(bh)) {
 289                         if (test_set_buffer_locked(bh)) {
 290                                 BUFFER_TRACE(bh, "needs blocking lock");
 291                                 spin_unlock(&journal->j_list_lock);
 292                                 /* Write out all data to prevent deadlocks */
 293                                 journal_do_submit_data(wbuf, bufs);
 294                                 bufs = 0;
 295                                 lock_buffer(bh);
 296                                 spin_lock(&journal->j_list_lock);
 297                         }
 298                         locked = 1;
 299                 }
 300                 /* We have to get bh_state lock. Again out of order, sigh. */
 301                 if (!inverted_lock(journal, bh)) {
                             /*
                              * The trylock failed and j_list_lock was dropped:
                              * take the bh_state lock first, then j_list_lock.
                              */
 302                         jbd_lock_bh_state(bh);
 303                         spin_lock(&journal->j_list_lock);
 304                 }
 305                 /* Someone already cleaned up the buffer? */
 306                 if (!buffer_jbd(bh)
 307                         || jh->b_transaction != commit_transaction
 308                         || jh->b_jlist != BJ_SyncData) {
 309                         jbd_unlock_bh_state(bh);
 310                         if (locked)
 311                                 unlock_buffer(bh);
 312                         BUFFER_TRACE(bh, "already cleaned up");
 313                         put_bh(bh);
 314                         continue;
 315                 }
 316                 if (locked && test_clear_buffer_dirty(bh)) {
 317                         BUFFER_TRACE(bh, "needs writeout, adding to array");
                             /*
                              * Our reference from get_bh() is not dropped here;
                              * it travels with the queued buffer.
                              */
 318                         wbuf[bufs++] = bh;
 319                         __jbd2_journal_file_buffer(jh, commit_transaction,
 320                                                 BJ_Locked);
 321                         jbd_unlock_bh_state(bh);
                             /* j_wbuf is full: submit the batch, restart the scan */
 322                         if (bufs == journal->j_wbufsize) {
 323                                 spin_unlock(&journal->j_list_lock);
 324                                 journal_do_submit_data(wbuf, bufs);
 325                                 bufs = 0;
 326                                 goto write_out_data;
 327                         }
 328                 } else if (!locked && buffer_locked(bh)) {
                             /*
                              * Not locked by us, but locked by someone else.
                              * NOTE(review): presumably refiled so that
                              * journal_wait_on_locked_list() waits for it later
                              * - confirm.
                              */
 329                         __jbd2_journal_file_buffer(jh, commit_transaction,
 330                                                 BJ_Locked);
 331                         jbd_unlock_bh_state(bh);
 332                         put_bh(bh);
 333                 } else {
 334                         BUFFER_TRACE(bh, "writeout complete: unfile");
 335                         __jbd2_journal_unfile_buffer(jh);
 336                         jbd_unlock_bh_state(bh);
 337                         if (locked)
 338                                 unlock_buffer(bh);
 339                         jbd2_journal_remove_journal_head(bh);
 340                         /* Once for our safety reference, once for
 341                          * jbd2_journal_remove_journal_head() */
 342                         put_bh(bh);
 343                         put_bh(bh);
 344                 }
 345
                     /* Give way if a reschedule or a lock break is wanted. */
 346                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
 347                         spin_unlock(&journal->j_list_lock);
 348                         goto write_out_data;
 349                 }
 350         }
 351         spin_unlock(&journal->j_list_lock);
             /* Submit whatever is still queued in wbuf. */
 352         journal_do_submit_data(wbuf, bufs);
 353 }
 354
     /*
      * Fold the data of @bh into a running checksum using crc32_be().
      *
      * @crc32_sum is the checksum accumulated so far; the updated value is
      * returned.  The buffer's page is mapped with kmap_atomic() only for the
      * duration of the crc32_be() call.
      */
 355 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 356 {
 357         unsigned long offset = offset_in_page(bh->b_data);
 358         char *mapped;
 359         __u32 folded;
 360
 361         mapped = kmap_atomic(bh->b_page, KM_USER0);
 362         folded = crc32_be(crc32_sum,
 363                 (void *)(mapped + offset), bh->b_size);
 364         kunmap_atomic(mapped, KM_USER0);
 365
 366         return folded;
 367 }
 368
     /*
      * Store block number @block in the journal block tag @tag.
      *
      * The low 32 bits always go into t_blocknr.  The high 32 bits are stored
      * in t_blocknr_high only when @tag_bytes exceeds JBD2_TAG_SIZE32.  Both
      * fields are written through cpu_to_be32().
      */
 369 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 370                                    unsigned long long block)
 371 {
 372         if (tag_bytes > JBD2_TAG_SIZE32)
 373                 tag->t_blocknr_high = cpu_to_be32((u32)(block >> 32));
 374         tag->t_blocknr = cpu_to_be32((u32)block);
 375 }
 376
 377 /*
 378  * jbd2_journal_commit_transaction
 379  *
 380  * The primary function for committing a transaction to the log.  This
 381  * function is called by the journal thread to begin a complete commit.
 382  */
 383 void jbd2_journal_commit_transaction(journal_t *journal)
 384 {
 385         struct transaction_stats_s stats;
 386         transaction_t *commit_transaction;
 387         struct journal_head *jh, *new_jh, *descriptor;
 388         struct buffer_head **wbuf = journal->j_wbuf;
 389         int bufs;
 390         int flags;
 391         int err;
 392         unsigned long long blocknr;
 393         char *tagp = NULL;
 394         journal_header_t *header;
 395         journal_block_tag_t *tag = NULL;
 396         int space_left = 0;
 397         int first_tag = 0;
 398         int tag_flag;
 399         int i;
 400         int tag_bytes = journal_tag_bytes(journal);
 401         struct buffer_head *cbh = NULL; /* For transactional checksums */
 402         __u32 crc32_sum = ~0;
 403
 404         /*
 405          * First job: lock down the current transaction and wait for
 406          * all outstanding updates to complete.
 407          */
 408
 409 #ifdef COMMIT_STATS
 410         spin_lock(&journal->j_list_lock);
 411         summarise_journal_usage(journal);
 412         spin_unlock(&journal->j_list_lock);
 413 #endif
 414
 415         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 416         if (journal->j_flags & JBD2_FLUSHED) {
 417                 jbd_debug(3, "super block updated\n");
 418                 jbd2_journal_update_superblock(journal, 1);
 419         } else {
 420                 jbd_debug(3, "superblock not updated\n");
 421         }
 422
 423         J_ASSERT(journal->j_running_transaction != NULL);
 424         J_ASSERT(journal->j_committing_transaction == NULL);
 425
 426         commit_transaction = journal->j_running_transaction;
 427         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 428
 429         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 430                         commit_transaction->t_tid);
 431
 432         spin_lock(&journal->j_state_lock);
 433         commit_transaction->t_state = T_LOCKED;
 434
 435         stats.u.run.rs_wait = commit_transaction->t_max_wait;
 436         stats.u.run.rs_locked = jiffies;
 437         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 438                                                 stats.u.run.rs_locked);
 439
 440         spin_lock(&commit_transaction->t_handle_lock);
 441         while (commit_transaction->t_updates) {
 442                 DEFINE_WAIT(wait);
 443
 444                 prepare_to_wait(&journal->j_wait_updates, &wait,
 445                                         TASK_UNINTERRUPTIBLE);
 446                 if (commit_transaction->t_updates) {
 447                         spin_unlock(&commit_transaction->t_handle_lock);
 448                         spin_unlock(&journal->j_state_lock);
 449                         schedule();
 450                         spin_lock(&journal->j_state_lock);
 451                         spin_lock(&commit_transaction->t_handle_lock);
 452                 }
 453                 finish_wait(&journal->j_wait_updates, &wait);
 454         }
 455         spin_unlock(&commit_transaction->t_handle_lock);
 456
 457         J_ASSERT (commit_transaction->t_outstanding_credits <=
 458                         journal->j_max_transaction_buffers);
 459
 460         /*
 461          * First thing we are allowed to do is to discard any remaining
 462          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 463          * that there are no such buffers: if a large filesystem
 464          * operation like a truncate needs to split itself over multiple
 465          * transactions, then it may try to do a jbd2_journal_restart() while
 466          * there are still BJ_Reserved buffers outstanding.  These must
 467          * be released cleanly from the current transaction.
 468          *
 469          * In this case, the filesystem must still reserve write access
 470          * again before modifying the buffer in the new transaction, but
 471          * we do not require it to remember exactly which old buffers it
 472          * has reserved.  This is consistent with the existing behaviour
 473          * that multiple jbd2_journal_get_write_access() calls to the same
 474          * buffer are perfectly permissible.
 475          */
 476         while (commit_transaction->t_reserved_list) {
 477                 jh = commit_transaction->t_reserved_list;
 478                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 479                 /*
 480                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 481                  * leave undo-committed data.
 482                  */
 483                 if (jh->b_committed_data) {
 484                         struct buffer_head *bh = jh2bh(jh);
 485
 486                         jbd_lock_bh_state(bh);
 487                         jbd2_free(jh->b_committed_data, bh->b_size);
 488                         jh->b_committed_data = NULL;
 489                         jbd_unlock_bh_state(bh);
 490                 }
 491                 jbd2_journal_refile_buffer(journal, jh);
 492         }
 493
 494         /*
 495          * Now try to drop any written-back buffers from the journal's
 496          * checkpoint lists.  We do this *before* commit because it potentially
 497          * frees some memory
 498          */
 499         spin_lock(&journal->j_list_lock);
 500         __jbd2_journal_clean_checkpoint_list(journal);
 501         spin_unlock(&journal->j_list_lock);
 502
 503         jbd_debug (3, "JBD: commit phase 1\n");
 504
 505         /*
 506          * Switch to a new revoke table.
 507          */
 508         jbd2_journal_switch_revoke_table(journal);
 509
 510         stats.u.run.rs_flushing = jiffies;
 511         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 512                                                stats.u.run.rs_flushing);
 513
 514         commit_transaction->t_state = T_FLUSH;
 515         journal->j_committing_transaction = commit_transaction;
 516         journal->j_running_transaction = NULL;
 517         commit_transaction->t_log_start = journal->j_head;
 518         wake_up(&journal->j_wait_transaction_locked);
 519         spin_unlock(&journal->j_state_lock);
 520
 521         jbd_debug (3, "JBD: commit phase 2\n");
 522
 523         /*
 524          * Now start flushing things to disk, in the order they appear
 525          * on the transaction lists.  Data blocks go first.
 526          */
 527         err = 0;
 528         journal_submit_data_buffers(journal, commit_transaction);
 529
 530         /*
 531          * Wait for all previously submitted IO to complete if commit
 532          * record is to be written synchronously.
 533          */
 534         spin_lock(&journal->j_list_lock);
 535         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 536                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 537                 err = journal_wait_on_locked_list(journal,
 538                                                 commit_transaction);
 539
 540         spin_unlock(&journal->j_list_lock);
 541
 542         if (err)
 543                 jbd2_journal_abort(journal, err);
 544
 545         jbd2_journal_write_revoke_records(journal, commit_transaction);
 546
 547         jbd_debug(3, "JBD: commit phase 2\n");
 548
 549         /*
 550          * If we found any dirty or locked buffers, then we should have
 551          * looped back up to the write_out_data label.  If there weren't
 552          * any then journal_clean_data_list should have wiped the list
 553          * clean by now, so check that it is in fact empty.
 554          */
 555         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
 556
 557         jbd_debug (3, "JBD: commit phase 3\n");
 558
 559         /*
 560          * Way to go: we have now written out all of the data for a
 561          * transaction!  Now comes the tricky part: we need to write out
 562          * metadata.  Loop over the transaction's entire buffer list:
 563          */
 564         spin_lock(&journal->j_state_lock);
 565         commit_transaction->t_state = T_COMMIT;
 566         spin_unlock(&journal->j_state_lock);
 567
 568         stats.u.run.rs_logging = jiffies;
 569         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 570                                                  stats.u.run.rs_logging);
 571         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 572         stats.u.run.rs_blocks_logged = 0;
 573
 574         J_ASSERT(commit_transaction->t_nr_buffers <=
 575                  commit_transaction->t_outstanding_credits);
 576
 577         descriptor = NULL;
 578         bufs = 0;
 579         while (commit_transaction->t_buffers) {
 580
 581                 /* Find the next buffer to be journaled... */
 582
 583                 jh = commit_transaction->t_buffers;
 584
 585                 /* If we're in abort mode, we just un-journal the buffer and
 586                    release it for background writing. */
 587
 588                 if (is_journal_aborted(journal)) {
 589                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 590                         jbd2_journal_refile_buffer(journal, jh);
 591                         /* If that was the last one, we need to clean up
 592                          * any descriptor buffers which may have been
 593                          * already allocated, even if we are now
 594                          * aborting. */
 595                         if (!commit_transaction->t_buffers)
 596                                 goto start_journal_io;
 597                         continue;
 598                 }
 599
 600                 /* Make sure we have a descriptor block in which to
 601                    record the metadata buffer. */
 602
 603                 if (!descriptor) {
 604                         struct buffer_head *bh;
 605
 606                         J_ASSERT (bufs == 0);
 607
 608                         jbd_debug(4, "JBD: get descriptor\n");
 609
 610                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 611                         if (!descriptor) {
 612                                 jbd2_journal_abort(journal, -EIO);
 613                                 continue;
 614                         }
 615
 616                         bh = jh2bh(descriptor);
 617                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 618                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 619                         header = (journal_header_t *)&bh->b_data[0];
 620                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 621                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 622                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 623
 624                         tagp = &bh->b_data[sizeof(journal_header_t)];
 625                         space_left = bh->b_size - sizeof(journal_header_t);
 626                         first_tag = 1;
 627                         set_buffer_jwrite(bh);
 628                         set_buffer_dirty(bh);
 629                         wbuf[bufs++] = bh;
 630
 631                         /* Record it so that we can wait for IO
 632                            completion later */
 633                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 634                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 635                                         BJ_LogCtl);
 636                 }
 637
 638                 /* Where is the buffer to be written? */
 639
 640                 err = jbd2_journal_next_log_block(journal, &blocknr);
 641                 /* If the block mapping failed, just abandon the buffer
 642                    and repeat this loop: we'll fall into the
 643                    refile-on-abort condition above. */
 644                 if (err) {
 645                         jbd2_journal_abort(journal, err);
 646                         continue;
 647                 }
 648
 649                 /*
 650                  * start_this_handle() uses t_outstanding_credits to determine
 651                  * the free space in the log, but this counter is changed
 652                  * by jbd2_journal_next_log_block() also.
 653                  */
 654                 commit_transaction->t_outstanding_credits--;
 655
 656                 /* Bump b_count to prevent truncate from stumbling over
 657                    the shadowed buffer!  @@@ This can go if we ever get
 658                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 659                 atomic_inc(&jh2bh(jh)->b_count);
 660
 661                 /* Make a temporary IO buffer with which to write it out
 662                    (this will requeue both the metadata buffer and the
 663                    temporary IO buffer). new_bh goes on BJ_IO*/
 664
 665                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 666                 /*
 667                  * akpm: jbd2_journal_write_metadata_buffer() sets
 668                  * new_bh->b_transaction to commit_transaction.
 669                  * We need to clean this up before we release new_bh
 670                  * (which is of type BJ_IO)
 671                  */
 672                 JBUFFER_TRACE(jh, "ph3: write metadata");
 673                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 674                                                       jh, &new_jh, blocknr);
 675                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 676                 wbuf[bufs++] = jh2bh(new_jh);
 677
 678                 /* Record the new block's tag in the current descriptor
 679                    buffer */
 680
 681                 tag_flag = 0;
 682                 if (flags & 1)
 683                         tag_flag |= JBD2_FLAG_ESCAPE;
 684                 if (!first_tag)
 685                         tag_flag |= JBD2_FLAG_SAME_UUID;
 686
 687                 tag = (journal_block_tag_t *) tagp;
 688                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 689                 tag->t_flags = cpu_to_be32(tag_flag);
 690                 tagp += tag_bytes;
 691                 space_left -= tag_bytes;
 692
 693                 if (first_tag) {
 694                         memcpy (tagp, journal->j_uuid, 16);
 695                         tagp += 16;
 696                         space_left -= 16;
 697                         first_tag = 0;
 698                 }
 699
 700                 /* If there's no more to do, or if the descriptor is full,
 701                    let the IO rip! */
 702
 703                 if (bufs == journal->j_wbufsize ||
 704                     commit_transaction->t_buffers == NULL ||
 705                     space_left < tag_bytes + 16) {
 706
 707                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 708
 709                         /* Write an end-of-descriptor marker before
 710                            submitting the IOs.  "tag" still points to
 711                            the last tag we set up. */
 712
 713                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 714
 715 start_journal_io:
 716                         for (i = 0; i < bufs; i++) {
 717                                 struct buffer_head *bh = wbuf[i];
 718                                 /*
 719                                  * Compute checksum.
 720                                  */
 721                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 722                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 723                                         crc32_sum =
 724                                             jbd2_checksum_data(crc32_sum, bh);
 725                                 }
 726
 727                                 lock_buffer(bh);
 728                                 clear_buffer_dirty(bh);
 729                                 set_buffer_uptodate(bh);
 730                                 bh->b_end_io = journal_end_buffer_io_sync;
 731                                 submit_bh(WRITE, bh);
 732                         }
 733                         cond_resched();
 734                         stats.u.run.rs_blocks_logged += bufs;
 735
 736                         /* Force a new descriptor to be generated next
 737                            time round the loop. */
 738                         descriptor = NULL;
 739                         bufs = 0;
 740                 }
 741         }
 742
 743         /* Done it all: now write the commit record asynchronously. */
 744
 745         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 746                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 747                 err = journal_submit_commit_record(journal, commit_transaction,
 748                                                  &cbh, crc32_sum);
 749                 if (err)
 750                         __jbd2_journal_abort_hard(journal);
 751
 752                 spin_lock(&journal->j_list_lock);
 753                 err = journal_wait_on_locked_list(journal,
 754                                                 commit_transaction);
 755                 spin_unlock(&journal->j_list_lock);
 756                 if (err)
 757                         __jbd2_journal_abort_hard(journal);
 758         }
 759
 760         /* Lo and behold: we have just managed to send a transaction to
 761            the log.  Before we can commit it, wait for the IO so far to
 762            complete.  Control buffers being written are on the
 763            transaction's t_log_list queue, and metadata buffers are on
 764            the t_iobuf_list queue.
 765
 766            Wait for the buffers in reverse order.  That way we are
 767            less likely to be woken up until all IOs have completed, and
 768            so we incur less scheduling load.
 769         */
 770
 771         jbd_debug(3, "JBD: commit phase 4\n");
 772
 773         /*
 774          * akpm: these are BJ_IO, and j_list_lock is not needed.
 775          * See __journal_try_to_free_buffer.
 776          */
 777 wait_for_iobuf:
 778         while (commit_transaction->t_iobuf_list != NULL) {
 779                 struct buffer_head *bh;
 780
 781                 jh = commit_transaction->t_iobuf_list->b_tprev;
 782                 bh = jh2bh(jh);
 783                 if (buffer_locked(bh)) {
 784                         wait_on_buffer(bh);
 785                         goto wait_for_iobuf;
 786                 }
 787                 if (cond_resched())
 788                         goto wait_for_iobuf;
 789
 790                 if (unlikely(!buffer_uptodate(bh)))
 791                         err = -EIO;
 792
 793                 clear_buffer_jwrite(bh);
 794
 795                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 796                 jbd2_journal_unfile_buffer(journal, jh);
 797
 798                 /*
 799                  * ->t_iobuf_list should contain only dummy buffer_heads
 800                  * which were created by jbd2_journal_write_metadata_buffer().
 801                  */
 802                 BUFFER_TRACE(bh, "dumping temporary bh");
 803                 jbd2_journal_put_journal_head(jh);
 804                 __brelse(bh);
 805                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 806                 free_buffer_head(bh);
 807
 808                 /* We also have to unlock and free the corresponding
 809                    shadowed buffer */
 810                 jh = commit_transaction->t_shadow_list->b_tprev;
 811                 bh = jh2bh(jh);
 812                 clear_bit(BH_JWrite, &bh->b_state);
 813                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 814
 815                 /* The metadata is now released for reuse, but we need
 816                    to remember it against this transaction so that when
 817                    we finally commit, we can do any checkpointing
 818                    required. */
 819                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 820                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 821                 /* Wake up any transactions which were waiting for this
 822                    IO to complete */
 823                 wake_up_bit(&bh->b_state, BH_Unshadow);
 824                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 825                 __brelse(bh);
 826         }
 827
 828         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 829
 830         jbd_debug(3, "JBD: commit phase 5\n");
 831
 832         /* Here we wait for the revoke record and descriptor record buffers */
 833  wait_for_ctlbuf:
 834         while (commit_transaction->t_log_list != NULL) {
 835                 struct buffer_head *bh;
 836
 837                 jh = commit_transaction->t_log_list->b_tprev;
 838                 bh = jh2bh(jh);
 839                 if (buffer_locked(bh)) {
 840                         wait_on_buffer(bh);
 841                         goto wait_for_ctlbuf;
 842                 }
 843                 if (cond_resched())
 844                         goto wait_for_ctlbuf;
 845
 846                 if (unlikely(!buffer_uptodate(bh)))
 847                         err = -EIO;
 848
 849                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 850                 clear_buffer_jwrite(bh);
 851                 jbd2_journal_unfile_buffer(journal, jh);
 852                 jbd2_journal_put_journal_head(jh);
 853                 __brelse(bh);           /* One for getblk */
 854                 /* AKPM: bforget here */
 855         }
 856
 857         jbd_debug(3, "JBD: commit phase 6\n");
 858
 859         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 860                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 861                 err = journal_submit_commit_record(journal, commit_transaction,
 862                                                 &cbh, crc32_sum);
 863                 if (err)
 864                         __jbd2_journal_abort_hard(journal);
 865         }
 866         if (!err && !is_journal_aborted(journal))
 867                 err = journal_wait_on_commit_record(cbh);
 868
 869         if (err)
 870                 jbd2_journal_abort(journal, err);
 871
 872         /* End of a transaction!  Finally, we can do checkpoint
 873            processing: any buffers committed as a result of this
 874            transaction can be removed from any checkpoint list it was on
 875            before. */
 876
 877         jbd_debug(3, "JBD: commit phase 7\n");
 878
 879         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
 880         J_ASSERT(commit_transaction->t_buffers == NULL);
 881         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 882         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 883         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 884         J_ASSERT(commit_transaction->t_log_list == NULL);
 885
 886 restart_loop:
 887         /*
 888          * As there are other places (journal_unmap_buffer()) adding buffers
 889          * to this list we have to be careful and hold the j_list_lock.
 890          */
 891         spin_lock(&journal->j_list_lock);
 892         while (commit_transaction->t_forget) {
 893                 transaction_t *cp_transaction;
 894                 struct buffer_head *bh;
 895
 896                 jh = commit_transaction->t_forget;
 897                 spin_unlock(&journal->j_list_lock);
 898                 bh = jh2bh(jh);
 899                 jbd_lock_bh_state(bh);
 900                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 901                         jh->b_transaction == journal->j_running_transaction);
 902
 903                 /*
 904                  * If there is undo-protected committed data against
 905                  * this buffer, then we can remove it now.  If it is a
 906                  * buffer needing such protection, the old frozen_data
 907                  * field now points to a committed version of the
 908                  * buffer, so rotate that field to the new committed
 909                  * data.
 910                  *
 911                  * Otherwise, we can just throw away the frozen data now.
 912                  */
 913                 if (jh->b_committed_data) {
 914                         jbd2_free(jh->b_committed_data, bh->b_size);
 915                         jh->b_committed_data = NULL;
 916                         if (jh->b_frozen_data) {
 917                                 jh->b_committed_data = jh->b_frozen_data;
 918                                 jh->b_frozen_data = NULL;
 919                         }
 920                 } else if (jh->b_frozen_data) {
 921                         jbd2_free(jh->b_frozen_data, bh->b_size);
 922                         jh->b_frozen_data = NULL;
 923                 }
 924
 925                 spin_lock(&journal->j_list_lock);
 926                 cp_transaction = jh->b_cp_transaction;
 927                 if (cp_transaction) {
 928                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 929                         cp_transaction->t_chp_stats.cs_dropped++;
 930                         __jbd2_journal_remove_checkpoint(jh);
 931                 }
 932
 933                 /* Only re-checkpoint the buffer_head if it is marked
 934                  * dirty.  If the buffer was added to the BJ_Forget list
 935                  * by jbd2_journal_forget, it may no longer be dirty and
 936                  * there's no point in keeping a checkpoint record for
 937                  * it. */
 938
 939                 /* A buffer which has been freed while still being
 940                  * journaled by a previous transaction may end up still
 941                  * being dirty here, but we want to avoid writing back
 942                  * that buffer in the future now that the last use has
 943                  * been committed.  That's not only a performance gain,
 944                  * it also stops aliasing problems if the buffer is left
 945                  * behind for writeback and gets reallocated for another
 946                  * use in a different page. */
 947                 if (buffer_freed(bh)) {
 948                         clear_buffer_freed(bh);
 949                         clear_buffer_jbddirty(bh);
 950                 }
 951
 952                 if (buffer_jbddirty(bh)) {
 953                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 954                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 955                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 956                         __jbd2_journal_refile_buffer(jh);
 957                         jbd_unlock_bh_state(bh);
 958                 } else {
 959                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 960                         /* The buffer on BJ_Forget list and not jbddirty means
 961                          * it has been freed by this transaction and hence it
 962                          * could not have been reallocated until this
 963                          * transaction has committed. *BUT* it could be
 964                          * reallocated once we have written all the data to
 965                          * disk and before we process the buffer on BJ_Forget
 966                          * list. */
 967                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 968                         __jbd2_journal_refile_buffer(jh);
 969                         if (!jh->b_transaction) {
 970                                 jbd_unlock_bh_state(bh);
 971                                  /* needs a brelse */
 972                                 jbd2_journal_remove_journal_head(bh);
 973                                 release_buffer_page(bh);
 974                         } else
 975                                 jbd_unlock_bh_state(bh);
 976                 }
 977                 cond_resched_lock(&journal->j_list_lock);
 978         }
 979         spin_unlock(&journal->j_list_lock);
 980         /*
 981          * This is a bit sleazy.  We use j_list_lock to protect transition
 982          * of a transaction into T_FINISHED state and calling
 983          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 984          * other checkpointing code processing the transaction...
 985          */
 986         spin_lock(&journal->j_state_lock);
 987         spin_lock(&journal->j_list_lock);
 988         /*
 989          * Now recheck if some buffers did not get attached to the transaction
 990          * while the lock was dropped...
 991          */
 992         if (commit_transaction->t_forget) {
 993                 spin_unlock(&journal->j_list_lock);
 994                 spin_unlock(&journal->j_state_lock);
 995                 goto restart_loop;
 996         }
 997
 998         /* Done with this transaction! */
 999
1000         jbd_debug(3, "JBD: commit phase 8\n");
1001
1002         J_ASSERT(commit_transaction->t_state == T_COMMIT);
1003
1004         commit_transaction->t_start = jiffies;
1005         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1006                                                 commit_transaction->t_start);
1007
1008         /*
1009          * File the transaction for history
1010          */
1011         stats.ts_type = JBD2_STATS_RUN;
1012         stats.ts_tid = commit_transaction->t_tid;
1013         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1014         spin_lock(&journal->j_history_lock);
1015         memcpy(journal->j_history + journal->j_history_cur, &stats,
1016                         sizeof(stats));
1017         if (++journal->j_history_cur == journal->j_history_max)
1018                 journal->j_history_cur = 0;
1019
1020         /*
1021          * Calculate overall stats
1022          */
1023         journal->j_stats.ts_tid++;
1024         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1025         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1026         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1027         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1028         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1029         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1030         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1031         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1032         spin_unlock(&journal->j_history_lock);
1033
1034         commit_transaction->t_state = T_FINISHED;
1035         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1036         journal->j_commit_sequence = commit_transaction->t_tid;
1037         journal->j_committing_transaction = NULL;
1038         spin_unlock(&journal->j_state_lock);
1039
1040         if (commit_transaction->t_checkpoint_list == NULL &&
1041             commit_transaction->t_checkpoint_io_list == NULL) {
1042                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1043         } else {
1044                 if (journal->j_checkpoint_transactions == NULL) {
1045                         journal->j_checkpoint_transactions = commit_transaction;
1046                         commit_transaction->t_cpnext = commit_transaction;
1047                         commit_transaction->t_cpprev = commit_transaction;
1048                 } else {
1049                         commit_transaction->t_cpnext =
1050                                 journal->j_checkpoint_transactions;
1051                         commit_transaction->t_cpprev =
1052                                 commit_transaction->t_cpnext->t_cpprev;
1053                         commit_transaction->t_cpnext->t_cpprev =
1054                                 commit_transaction;
1055                         commit_transaction->t_cpprev->t_cpnext =
1056                                 commit_transaction;
1057                 }
1058         }
1059         spin_unlock(&journal->j_list_lock);
1060
1061         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1062                   journal->j_commit_sequence, journal->j_tail_sequence);
1063
1064         wake_up(&journal->j_wait_done_commit);
1065 }