Linux 2.6.27.29
fs/jbd2/commit.c
blob b1f07565ae20aedd7850fb3660187641fbba1bec
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

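/*
 * Callers in this file wire this handler up by hand before submission:
 * they lock the buffer, clear the dirty bit, set bh->b_end_io to this
 * function and then call submit_bh(); see journal_submit_commit_record()
 * and the metadata write-out loop in jbd2_journal_commit_transaction().
 */
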
/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

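/*
 * Note the ordering above: an extra page reference is taken before the
 * buffer reference is dropped, so the page cannot go away while
 * try_to_free_buffers() runs, and the page ref is only released after
 * the page has been unlocked.
 */
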
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct journal_head *descriptor;
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        int barrier_done = 0;
        struct timespec now = current_kernel_time();

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        tmp = (struct commit_header *)bh->b_data;
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
                tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
        }
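        /*
         * At this point the commit block carries, in big-endian form, the
         * JBD2 magic number, the JBD2_COMMIT_BLOCK type, the committing
         * transaction's tid and a commit timestamp, plus a CRC32 of the
         * transaction's blocks when the CHECKSUM compat feature is set.
         */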
        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
        clear_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;

        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = submit_bh(WRITE, bh);
        if (barrier_done)
                clear_buffer_ordered(bh);

        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                       "JBD: barrier-based sync failed on %s - "
                       "disabling barriers\n",
                       bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                lock_buffer(bh);
                set_buffer_uptodate(bh);
                clear_buffer_dirty(bh);
                ret = submit_bh(WRITE, bh);
        }
        *cbh = bh;
        return ret;
}

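/*
 * The -EOPNOTSUPP fallback above is effectively one-shot per journal:
 * once JBD2_BARRIER has been cleared under j_state_lock, later commits
 * submit plain writes and never set the ordered flag again.
 */
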
/*
 * This function along with journal_submit_commit_record
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
                                         struct buffer_head *bh)
{
        int ret = 0;

retry:
        clear_buffer_dirty(bh);
        wait_on_buffer(bh);
        if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
                printk(KERN_WARNING
                       "JBD2: wait_on_commit_record: sync failed on %s - "
                       "disabling barriers\n", journal->j_devname);
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                lock_buffer(bh);
                clear_buffer_dirty(bh);
                set_buffer_uptodate(bh);
                bh->b_end_io = journal_end_buffer_io_sync;

                ret = submit_bh(WRITE_SYNC, bh);
                if (ret) {
                        unlock_buffer(bh);
                        return ret;
                }
                goto retry;
        }

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);             /* One for getblk() */
        jbd2_journal_put_journal_head(bh2jh(bh));

        return ret;
}

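/*
 * Pairs with journal_submit_commit_record(): the *cbh handed back by
 * the submit path is passed to this function once the commit block
 * must be known to be on disk.
 */
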
/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
        int ret;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = mapping->nrpages * 2,
                .range_start = 0,
                .range_end = i_size_read(mapping->host),
                .for_writepages = 1,
        };

        ret = generic_writepages(mapping, &wbc);
        return ret;
}

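/*
 * generic_writepages() walks the dirty pages and calls the
 * address_space's ->writepage() on each one, which is what the comment
 * above relies on: a filesystem's ->writepages() may allocate blocks
 * for delalloc ranges, whereas plain ->writepage() on this path writes
 * only already-allocated blocks.
 */
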
/*
 * Submit all the data buffers of the inodes associated with the
 * transaction to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
                                       transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                /*
                 * Submit the inode data buffers. We use writepage
                 * instead of writepages because, with delayed allocation,
                 * writepages can do block allocation; we need to write
                 * only already-allocated blocks here.
                 */
                err = journal_submit_inode_data_buffers(mapping);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
        return ret;
}

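/*
 * A minimal sketch of the other side of the JI_COMMIT_RUNNING handshake
 * (illustrative only; the real waiter lives elsewhere in jbd2, e.g.
 * when a jbd2_inode is being torn down):
 *
 *      DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 *      wait_queue_head_t *wq = bit_waitqueue(&jinode->i_flags,
 *                                            __JI_COMMIT_RUNNING);
 *      prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 *      if (jinode->i_flags & JI_COMMIT_RUNNING)
 *              schedule();
 *      finish_wait(wq, &wait.wait);
 *
 * which is why the loops above and below set the flag before dropping
 * j_list_lock and call wake_up_bit() after clearing it.
 */
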
/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
                                             transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode, *next_i;
        int err, ret = 0;

        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
                        /*
                         * Because AS_EIO is cleared by
                         * wait_on_page_writeback_range(), set it again so
                         * that the user process can get -EIO from fsync().
                         */
                        set_bit(AS_EIO,
                                &jinode->i_vfs_inode->i_mapping->flags);

                        if (!ret)
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }

        /* Now refile inodes to the proper lists */
        list_for_each_entry_safe(jinode, next_i,
                                 &commit_transaction->t_inode_list, i_list) {
                list_del(&jinode->i_list);
                if (jinode->i_next_transaction) {
                        jinode->i_transaction = jinode->i_next_transaction;
                        jinode->i_next_transaction = NULL;
                        list_add(&jinode->i_list,
                                 &jinode->i_transaction->t_inode_list);
                } else {
                        jinode->i_transaction = NULL;
                }
        }
        spin_unlock(&journal->j_list_lock);

        return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page, KM_USER0);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr, KM_USER0);

        return checksum;
}

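/*
 * The checksum is chained: jbd2_journal_commit_transaction() seeds
 * crc32_sum with ~0 and feeds every journal block it submits through
 * this helper, so the single value stored in the commit header covers
 * the whole transaction.
 */
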
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                            unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD2_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

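/*
 * Worked example (illustrative): for block 0x123456789 on a journal
 * with 64-bit tags, t_blocknr holds 0x23456789 and t_blocknr_high
 * holds 0x1; on a journal with 32-bit tags the high word is simply
 * never written.  The (block >> 31) >> 1 form is equivalent to
 * block >> 32 for a 64-bit value.
 */
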
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
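/*
 * For orientation, a hedged sketch of how a client filesystem feeds
 * this path (illustrative only; the actual call sites live in the
 * filesystem, not in this file).  Each modification runs under a
 * handle:
 *
 *      handle_t *handle = jbd2_journal_start(journal, nblocks);
 *      err = jbd2_journal_get_write_access(handle, bh);
 *      ...modify bh...
 *      err = jbd2_journal_dirty_metadata(handle, bh);
 *      err = jbd2_journal_stop(handle);
 *
 * and the buffers so dirtied accumulate on the running transaction
 * that this function then locks down and writes out.
 */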
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                jbd2_journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                                stats.u.run.rs_locked);

        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);
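        /*
         * t_updates has drained to zero while the transaction sits in
         * T_LOCKED, so no handle can still be attached to it and
         * t_outstanding_credits is stable for the assertion below.
         */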
        J_ASSERT(commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        stats.u.run.rs_flushing = jiffies;
        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
                                               stats.u.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD: commit phase 2\n");
        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = journal_submit_data_buffers(journal, commit_transaction);
        if (err)
                jbd2_journal_abort(journal, err);

        jbd2_journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
                                                 stats.u.run.rs_logging);
        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
        stats.u.run.rs_blocks_logged = 0;

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        err = 0;
        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT(bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

                if (first_tag) {
                        memcpy(tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
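                /*
                 * On-disk layout of the descriptor block so far
                 * (illustrative): a journal_header_t, then the first tag
                 * followed by the 16-byte journal UUID, then any further
                 * tags back to back.  The last tag written gets
                 * JBD2_FLAG_LAST_TAG set below, just before the IO is
                 * submitted.
                 */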
                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (JBD2_HAS_COMPAT_FEATURE(journal,
                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Done it all: now write the commit record asynchronously. */

        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                   &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }

        /*
         * This is the right place to wait for data buffers both for ASYNC
         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
         * the commit block went to disk (which happens above). If commit is
         * SYNC, we need to wait for data buffers before we start writing
         * commit block, which happens below in such setting.
         */
        err = journal_finish_inode_data_buffers(journal, commit_transaction);
        if (err) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD2: Detected IO errors while flushing file data "
                        "on %s\n", bdevname(journal->j_fs_dev, b));
                err = 0;
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 3\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT(commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 4\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 5\n");

        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                   &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
        if (!err && !is_journal_aborted(journal))
                err = journal_wait_on_commit_record(journal, cbh);

        if (err)
                jbd2_journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list they
           were on before. */

        jbd_debug(3, "JBD: commit phase 6\n");

        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed.  *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __jbd2_journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                jbd2_journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_start = jiffies;
        stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
                                                commit_transaction->t_start);

        /*
         * File the transaction for history
         */
        stats.ts_type = JBD2_STATS_RUN;
        stats.ts_tid = commit_transaction->t_tid;
        stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
        spin_lock(&journal->j_history_lock);
        memcpy(journal->j_history + journal->j_history_cur, &stats,
               sizeof(stats));
        if (++journal->j_history_cur == journal->j_history_max)
                journal->j_history_cur = 0;

        /*
         * Calculate overall stats
         */
        journal->j_stats.ts_tid++;
        journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
        journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
        journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
        journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
        journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
        journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
        journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
        journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        /* Insert at the tail of the circular checkpoint
                           list, i.e. just before the current head. */
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

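        /*
         * Waiters such as jbd2_log_wait_commit() sleep on
         * j_wait_done_commit and recheck j_commit_sequence, which was
         * advanced above under j_state_lock.
         */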
        wake_up(&journal->j_wait_done_commit);
}