jbd2: Fix buffer head leak when writing the commit block
fs/jbd2/commit.c
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}
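/*
 * Note that journal_end_buffer_io_sync() runs from IO completion context
 * via bh->b_end_io, so it only records the result and unlocks the buffer;
 * the commit thread observes completion through wait_on_buffer().
 */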
/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
		       "JBD: barrier-based sync failed on %s - "
		       "disabling barriers\n",
		       bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		clear_buffer_dirty(bh);
		ret = submit_bh(WRITE, bh);
	}
	*cbh = bh;
	return ret;
}
/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
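	/*
	 * Drop our references now that the write has completed: the
	 * buffer head reference from getblk() and the journal head
	 * reference, both taken when jbd2_journal_get_descriptor_buffer()
	 * allocated the commit block.  Missing either put leaks the
	 * commit block's buffer head.
	 */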
	put_bh(bh);			/* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc.  We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
		.for_writepages = 1,
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
/*
 * Submit all the data buffers of the inodes associated with the
 * transaction to disk.
 *
 * We are in a committing transaction, therefore no new inode can be added
 * to our inode list.  We use the JI_COMMIT_RUNNING flag to protect the
 * inode we currently operate on from being released while we write out
 * its pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers.  We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only allocated blocks here.
		 */
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
					     transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * wait_on_page_writeback_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				 &jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
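/*
 * The transaction checksum is chained: jbd2_journal_commit_transaction()
 * seeds crc32_sum with ~0, folds in every block as it is submitted to the
 * log, and the final value lands in the commit header (h_chksum[0]) so
 * that recovery can verify the transaction as a whole.
 */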
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
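/*
 * A tag always carries the low 32 bits of the block number; only when the
 * journal uses the larger 64-bit tag format (tag_bytes > JBD2_TAG_SIZE32)
 * is the high half stored in t_blocknr_high.  The (block >> 31) >> 1 form
 * keeps the shift well-defined even if the block type is ever only 32
 * bits wide.
 */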
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
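	/*
	 * In outline, commit proceeds in phases: lock the running
	 * transaction and wait for its updates to drain; flush data
	 * buffers and revoke records; copy metadata into the log behind
	 * descriptor blocks; wait for that IO; write and wait on the
	 * commit record; then move surviving buffers to the checkpoint
	 * lists and retire the transaction.
	 */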
	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif
	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
		  commit_transaction->t_tid);
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	stats.u.run.rs_wait = commit_transaction->t_max_wait;
	stats.u.run.rs_locked = jiffies;
	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
						stats.u.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(commit_transaction->t_outstanding_credits <=
		 journal->j_max_transaction_buffers);
	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer()
		 * may leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	stats.u.run.rs_flushing = jiffies;
	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
					       stats.u.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);
	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	stats.u.run.rs_logging = jiffies;
	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
						 stats.u.run.rs_logging);
	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.u.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);
	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				  (unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
						 BJ_LogCtl);
		}
		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */

		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
							   jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
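		/*
		 * The descriptor block being filled now looks roughly like:
		 *
		 *	journal_header_t  (magic, blocktype, sequence)
		 *	tag 0, then the 16-byte journal UUID
		 *	tag 1 ... tag n  (JBD2_FLAG_SAME_UUID)
		 *
		 * with JBD2_FLAG_LAST_TAG set on the final tag below.
		 */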
		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();
			stats.u.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}
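	/*
	 * Note that metadata went out in batches of at most j_wbufsize
	 * buffers (the wbuf[] array), each batch led by the descriptor
	 * block whose tags map the batch back to its home blocks.
	 */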
	/* Done it all: now write the commit record asynchronously. */

	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
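	/*
	 * With ASYNC_COMMIT the commit block is not ordered against the
	 * metadata writes above; recovery depends on the transaction
	 * checksum instead, so a commit block that reaches disk ahead of
	 * incomplete metadata will simply fail to verify and be ignored.
	 */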
	/*
	 * This is the right place to wait for data buffers both for ASYNC
	 * and !ASYNC commit.  If commit is ASYNC, we need to wait only after
	 * the commit block went to disk (which happens above).  If commit is
	 * SYNC, we need to wait for data buffers before we start writing
	 * commit block, which happens below in such setting.
	 */
	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
		       "JBD2: Detected IO errors while flushing file data "
		       "on %s\n", bdevname(journal->j_fs_dev, b));
		err = 0;
	}
	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}
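	/*
	 * Every temporary BJ_IO buffer written above was paired with a
	 * BJ_Shadow copy of the original metadata; by the time the loop
	 * drains t_iobuf_list, each shadow buffer has been refiled to
	 * BJ_Forget, so the shadow list must now be empty.
	 */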
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 5\n");
	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(cbh);

	if (err)
		jbd2_journal_abort(journal, err);
	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction().  Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");
	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
						commit_transaction->t_start);

	/*
	 * File the transaction for history
	 */
	stats.ts_type = JBD2_STATS_RUN;
	stats.ts_tid = commit_transaction->t_tid;
	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
	spin_lock(&journal->j_history_lock);
	memcpy(journal->j_history + journal->j_history_cur, &stats,
	       sizeof(stats));
	if (++journal->j_history_cur == journal->j_history_max)
		journal->j_history_cur = 0;

	/*
	 * Calculate overall stats
	 */
	journal->j_stats.ts_tid++;
	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);
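	/*
	 * If nothing remains to be checkpointed, the transaction can be
	 * dropped right away; otherwise link it into the journal's
	 * circular list of checkpoint transactions, just behind the
	 * current head (i.e. at the tail).
	 */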
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}