/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}
/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on the
 * LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the VM
 * accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}
/*
 * Decrement reference counter for data buffer.  If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
        if (buffer_freed(bh)) {
                clear_buffer_freed(bh);
                release_buffer_page(bh);
        } else
                put_bh(bh);
}
/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                       transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        header = (journal_header_t *)(bh->b_data);
        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
        header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);

        if (journal->j_flags & JFS_BARRIER)
                ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
        else
                ret = sync_dirty_buffer(bh);

        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);

        return (ret == -EIO);
}
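
/*
 * Submit all buffers collected in wbuf[].  Each buffer carries a safety
 * reference taken when it was added to the array; submit_bh() consumes
 * that reference.
 */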
static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
                                   int write_op)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use-up our safety reference in submit_bh() */
                submit_bh(write_op, wbuf[i]);
        }
}
/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
                                       transaction_t *commit_transaction,
                                       int write_op)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
        int err = 0;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock.  We try to lock the buffer without
                 * blocking.  If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (!trylock_buffer(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                         commit_transaction);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock.  Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh) || bh2jh(bh) != jh
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        release_data_buffer(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                              BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                trace_jbd_do_submit_data(journal,
                                                         commit_transaction);
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __journal_file_buffer(jh, commit_transaction,
                                              BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        release_data_buffer(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        trace_jbd_do_submit_data(journal, commit_transaction);
        journal_do_submit_data(wbuf, bufs, write_op);

        return err;
}
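
/*
 * A rough map of the commit phases below, following the jbd_debug()
 * markers in the code:
 *   1: clear revoked-buffer flags and switch revoke tables;
 *   2: submit ordered data and revoke records, wait for data IO;
 *   3: copy metadata into the log behind descriptor blocks;
 *   4: wait for metadata IO, move shadowed buffers to BJ_Forget;
 *   5: wait for the revoke/descriptor control buffers;
 *   6: write the commit record;
 *   7: checkpoint processing of the forget list;
 *   8: mark the transaction T_FINISHED and update journal state.
 */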
/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned int blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        struct blk_plug plug;
        int write_op = WRITE;
        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so tail cannot change under us.
                 * We don't need any special data guarantees for writing sb
                 * since journal is empty and it is ok for write to be
                 * flushed only with transaction commit.
                 */
                journal_update_sb_log_tail(journal, journal->j_tail_sequence,
                                           journal->j_tail, WRITE_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        trace_jbd_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                  commit_transaction->t_tid);
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd_commit_locking(journal, commit_transaction);
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT(commit_transaction->t_outstanding_credits <=
                 journal->j_max_transaction_buffers);
        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it
         * potentially frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);
        jbd_debug(3, "JBD: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked
         * buffers in the next transaction which is going to be started.
         */
        journal_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);
        trace_jbd_commit_flushing(journal, commit_transaction);
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD: commit phase 2\n");
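
        /*
         * If somebody is already waiting for this transaction to commit
         * (e.g. an fsync()), issue the journal IO as WRITE_SYNC so the
         * block layer prioritises it.
         */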
        if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
                write_op = WRITE_SYNC;

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
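        /*
         * The submissions are batched under a block-layer plug so that
         * adjacent requests can be merged before the device sees them.
         */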
        blk_start_plug(&plug);
        err = journal_submit_data_buffers(journal, commit_transaction,
                                          write_op);
        blk_finish_plug(&plug);
        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        spin_lock(&journal->j_list_lock);
                }
                if (unlikely(!buffer_uptodate(bh))) {
                        if (!trylock_page(bh->b_page)) {
                                spin_unlock(&journal->j_list_lock);
                                lock_page(bh->b_page);
                                spin_lock(&journal->j_list_lock);
                        }
                        if (bh->b_page->mapping)
                                set_bit(AS_EIO, &bh->b_page->mapping->flags);

                        unlock_page(bh->b_page);
                        SetPageError(bh->b_page);
                        err = -EIO;
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
                    jh->b_transaction == commit_transaction &&
                    jh->b_jlist == BJ_Locked)
                        __journal_unfile_buffer(jh);
                jbd_unlock_bh_state(bh);
                release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        if (err) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: Detected IO errors while flushing file data "
                        "on %s\n", bdevname(journal->j_fs_dev, b));
                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
                        journal_abort(journal, err);
                err = 0;
        }

        blk_start_plug(&plug);
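
        /*
         * Write this transaction's revoke records to the log.  During
         * recovery, a revoked block must not be replayed from any older
         * copy earlier in the log.
         */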
        journal_write_revoke_records(journal, commit_transaction, write_op);

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        jbd_debug(3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        trace_jbd_commit_logging(journal, commit_transaction);
        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT(bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                  (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                            BJ_LogCtl);
                }
                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                get_bh(jh2bh(jh));

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer).  new_bh goes on BJ_IO */
                set_buffer_jwrite(jh2bh(jh));
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_buffer_jwrite(jh2bh(new_jh));
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
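                /*
                 * Bit 0 of the journal_write_metadata_buffer() return value
                 * indicates that the block was "escaped": its first four
                 * bytes matched the journal magic number and were zeroed in
                 * the log copy, so recovery must restore them.  Record that
                 * fact in the tag.
                 */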
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy(tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(write_op, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }
        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
         */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);
                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /*
                 * Wake up any transactions which were waiting for this
                 * IO to complete.  The barrier must be here so that changes
                 * by journal_file_buffer() take effect before wake_up_bit()
                 * does the waitqueue check.
                 */
                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }
        if (err)
                journal_abort(journal, err);

        jbd_debug(3, "JBD: commit phase 6\n");

        /* All metadata is written, now write commit record and do cleanup */
        spin_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_RECORD;
        spin_unlock(&journal->j_state_lock);

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }
                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */
                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future after the "add to orphan"
                 * operation has been committed.  That's not only a
                 * performance gain, it also stops aliasing problems if
                 * the buffer is left behind for writeback and gets
                 * reallocated for another use in a different page. */
                if (buffer_freed(bh) && !jh->b_next_transaction) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }
                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed.  *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                __journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __journal_drop_transaction().  Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }
        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * Weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in commit time.
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time*3 +
                                journal->j_average_commit_time) / 4;
        else
                journal->j_average_commit_time = commit_time;

        spin_unlock(&journal->j_state_lock);
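
        /*
         * If nothing is left to checkpoint, the transaction can be dropped
         * right away; otherwise link it into the journal's circular list of
         * transactions awaiting checkpointing (j_list_lock is still held
         * here).
         */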
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);
        trace_jbd_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}