From ddb17c8b1eee7a560779f8c6c48fb5439fce8cd8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Apr 2016 23:41:43 -0400 Subject: [PATCH] add patch fix-data-exposure-after-a-crash --- fix-data-exposure-after-a-crash | 82 +++++++++++++++++++++++++++++++++++++++++ series | 2 + timestamps | 7 ++-- 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 fix-data-exposure-after-a-crash diff --git a/fix-data-exposure-after-a-crash b/fix-data-exposure-after-a-crash new file mode 100644 index 00000000..f0830a8b --- /dev/null +++ b/fix-data-exposure-after-a-crash @@ -0,0 +1,82 @@ +ext4: fix data exposure after a crash + +From: Jan Kara + +Huang has reported that in his powerfail testing he is seeing stale +block contents in some of recently allocated blocks although he mounts +ext4 in data=ordered mode. After some investigation I have found out +that indeed when delayed allocation is used, we don't add inode to +transaction's list of inodes needing flushing before commit. Originally +we were doing that but commit f3b59291a69d removed the logic with a +flawed argument that it is not needed. + +The problem is that although for delayed allocated blocks we write their +contents immediately after allocating them, there is no guarantee that +the IO scheduler or device doesn't reorder things and thus transaction +allocating blocks and attaching them to inode can reach stable storage +before actual block contents. Actually whenever we attach freshly +allocated blocks to inode using a written extent, we should add inode to +transaction's ordered inode list to make sure we properly wait for block +contents to be written before committing the transaction. So that is +what we do in this patch. This also handles other cases where stale data +exposure was possible - like filling hole via mmap in +data=ordered,nodelalloc mode. + +The only exception to the above rule are extending direct IO writes where +blkdev_direct_IO() waits for IO to complete before increasing i_size and +thus stale data exposure is not possible. For now we don't complicate +the code with optimizing this special case since the overhead is pretty +low. In case this is observed to be a performance problem we can always +handle it using a special flag to ext4_map_blocks(). + +CC: stable@vger.kernel.org +Fixes: f3b59291a69d0b734be1fc8be489fef2dd846d3d +Reported-by: "HUANG Weller (CM/ESW12-CN)" +Tested-by: "HUANG Weller (CM/ESW12-CN)" +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +--- + fs/ext4/inode.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 981a1fc..576f64a 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -684,6 +684,20 @@ out_sem: + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; ++ ++ /* ++ * Inodes with freshly allocated blocks where contents will be ++ * visible after transaction commit must be on transaction's ++ * ordered data list. ++ */ ++ if (map->m_flags & EXT4_MAP_NEW && ++ !(map->m_flags & EXT4_MAP_UNWRITTEN) && ++ !(flags & EXT4_GET_BLOCKS_ZERO) && ++ ext4_should_order_data(inode)) { ++ ret = ext4_jbd2_file_inode(handle, inode); ++ if (ret) ++ return ret; ++ } + } + return retval; + } +@@ -1289,15 +1303,6 @@ static int ext4_write_end(struct file *file, + int i_size_changed = 0; + + trace_ext4_write_end(inode, pos, len, copied); +- if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { +- ret = ext4_jbd2_file_inode(handle, inode); +- if (ret) { +- unlock_page(page); +- put_page(page); +- goto errout; +- } +- } +- + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); diff --git a/series b/series index 5c56abb8..205daeeb 100644 --- a/series +++ b/series @@ -2,6 +2,8 @@ allow-resched-and-interruption-of-large-empty-directories +fix-data-exposure-after-a-crash + ########################################## # unstable patches #################################################### diff --git a/timestamps b/timestamps index 470387ef..be1c4e22 100755 --- a/timestamps +++ b/timestamps @@ -29,7 +29,8 @@ touch -d @1461372227 add-blkdiscard-ioctl touch -d @1461372231 block-dio-during-truncate touch -d @1461372301 delalloc-debug touch -d @1461372315 commit-as-soon-as-possible-after-log_start_commit -touch -d @1461372376 series touch -d @1461466207 allow-resched-and-interruption-of-large-empty-directories -touch -d @1461466207 status -touch -d @1461469044 timestamps +touch -d @1461469110 series +touch -d @1461469203 fix-data-exposure-after-a-crash +touch -d @1461469203 status +touch -d @1461469296 timestamps -- 2.11.4.GIT