3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 An experimental cleaner. Copy the live blocks from the transaction at the
6 tail in batches to the transaction at the head. After a commit ends, check
7 if free space is below the low watermark, and if so, start cleaning until
8 free space rises above the high watermark.
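
For context: low_on_space() and high_on_space() are only declared by this
patch; a minimal sketch of what such checks could look like, assuming
thresholds expressed against j_free and j_maxlen (the 1/8 and 1/4 fractions
are placeholders, not values taken from the patch):

int low_on_space(journal_t *journal)
{
	/* Assumed threshold: less than 1/8th of the log is free. */
	return journal->j_free < journal->j_maxlen / 8;
}

int high_on_space(journal_t *journal)
{
	/* Assumed threshold: more than 1/4th of the log is free. */
	return journal->j_free > journal->j_maxlen / 4;
}

The two thresholds give the cleaner hysteresis: once the low watermark is
crossed it keeps cleaning until free space climbs past the high one.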
10 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
13 fs/jbd2/Makefile | 2 +-
14 fs/jbd2/jmap.c | 43 ++++++++++++++++++++++++++++++-----
15 fs/jbd2/journal.c | 12 +++++++++-
16 include/linux/jbd2.h | 6 ++++-
17 include/linux/jmap.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
18 5 files changed, 151 insertions(+), 23 deletions(-)
20 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
21 index a54f50b3a06e..b6a2dddcc0a7 100644
22 --- a/fs/jbd2/Makefile
23 +++ b/fs/jbd2/Makefile
25 obj-$(CONFIG_JBD2) += jbd2.o
27 jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
30 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
31 index 8c844f65eeaa..693b3e8d736c 100644
34 @@ -38,7 +38,7 @@ int jbd2_init_transaction_infos(journal_t *journal)
37 for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
38 - INIT_LIST_HEAD(&tis->buf[i].live_logblks);
39 + INIT_LIST_HEAD(&tis->buf[i].live_blks);
41 journal->j_transaction_infos = tis;
43 @@ -91,15 +91,26 @@ static int process_existing_mappings(journal_t *journal,
44 * We are either deleting the entry because it was revoked, or
45 * we are moving it to the live blocks list of this transaction.
46 * In either case, we remove it from its existing list.
47 + * However, before removing it, we check whether this is the
48 + * entry in the live blocks list of the tail transaction that the
49 + * cleaner has cached a pointer to, and if so, we update the
50 + * cached pointer.
52 - list_del(&je->list);
53 + spin_lock(&journal->j_cleaner_ctx->pos_lock);
54 + if (je == journal->j_cleaner_ctx->pos) {
55 + journal->j_cleaner_ctx->pos = list_next_entry(je, list);
56 + trace_jbd2_jmap_printf1("updating pos to",
57 + (unsigned long long) journal->j_cleaner_ctx->pos);
59 + list_del(&je->list);
60 + spin_unlock(&journal->j_cleaner_ctx->pos_lock);
63 rb_erase(&je->rb_node, &journal->j_jmap);
64 kmem_cache_free(jbd2_jmap_cache, je);
66 trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
67 - fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
68 + fill_entry(je, &mappings[i], t_idx, &ti->live_blks);
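
The consumer of this cached pointer is not in the quoted hunks; a
hypothetical sketch of how the cleaner might walk the tail transaction's
live_blks list under pos_lock, collecting up to CLEANER_BATCH_SIZE mappings
(the helper name and the je->mapping field access are assumptions):

static int gather_batch_mappings(struct cleaner_ctx *ctx,
				 struct transaction_info *ti)
{
	struct jmap_entry *je;
	int n = 0;

	/* The real code may also need j_jmap_lock; only pos_lock is shown. */
	spin_lock(&ctx->pos_lock);
	if (!ctx->pos && !list_empty(&ti->live_blks))
		ctx->pos = list_first_entry(&ti->live_blks,
					    struct jmap_entry, list);
	je = ctx->pos;
	while (je && n < CLEANER_BATCH_SIZE) {
		/* Assumes jmap_entry stores its blk_mapping in ->mapping. */
		ctx->mappings[n++] = je->mapping;
		je = list_is_last(&je->list, &ti->live_blks) ?
			NULL : list_next_entry(je, list);
	}
	ctx->pos = je;
	spin_unlock(&ctx->pos_lock);

	return n;
}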
72 @@ -161,8 +172,7 @@ static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
76 - fill_entry(new_entries[i], &mappings[i], t_idx,
78 + fill_entry(new_entries[i], &mappings[i], t_idx, &ti->live_blks);
79 rb_link_node(&new_entries[i]->rb_node, parent, p);
80 rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
81 trace_jbd2_jmap_insert(&mappings[i], t_idx);
82 @@ -189,7 +199,9 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
83 * We are possibly reusing space of an old transaction_info. The old
84 * transaction should not have any live blocks in it.
86 - BUG_ON(!list_empty(&ti->live_logblks));
87 + BUG_ON(!list_empty(&ti->live_blks));
89 + atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
91 write_lock(&journal->j_jmap_lock);
92 nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
93 @@ -432,12 +444,31 @@ int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
95 int jbd2_smr_journal_init(journal_t *journal)
97 + journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
99 + if (!journal->j_cleaner_ctx)
102 + journal->j_cleaner_ctx->journal = journal;
103 + journal->j_cleaner_ctx->pos = NULL;
104 + spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
105 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
106 + atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
107 + atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
108 + atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
109 + atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
110 + init_completion(&journal->j_cleaner_ctx->live_block_reads);
112 journal->j_jmap = RB_ROOT;
113 rwlock_init(&journal->j_jmap_lock);
115 return jbd2_init_transaction_infos(journal);
118 void jbd2_smr_journal_exit(journal_t *journal)
120 + atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
121 + flush_work(&journal->j_cleaner_ctx->work);
122 + kfree(journal->j_cleaner_ctx);
123 jbd2_free_transaction_infos(journal);
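
jbd2_smr_journal_exit() flushes j_cleaner_ctx->work, so the work item has to
be initialized somewhere not visible in the quoted hunks. A sketch of what
that could look like (the worker function name is a placeholder, not code
from the patch):

static void cleaner_workfn(struct work_struct *work)
{
	struct cleaner_ctx *ctx = container_of(work, struct cleaner_ctx, work);

	clean_next_batch(ctx->journal);
}

	/* e.g. in jbd2_smr_journal_init(), before any batch can be queued: */
	INIT_WORK(&journal->j_cleaner_ctx->work, cleaner_workfn);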
125 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
126 index 0cbfb7fdc45d..8e305aacef48 100644
127 --- a/fs/jbd2/journal.c
128 +++ b/fs/jbd2/journal.c
130 #include <asm/page.h>
132 #ifdef CONFIG_JBD2_DEBUG
133 -ushort jbd2_journal_enable_debug __read_mostly;
134 +ushort jbd2_journal_enable_debug __read_mostly = 1;
135 EXPORT_SYMBOL(jbd2_journal_enable_debug);
137 module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
138 @@ -227,6 +227,14 @@ static int kjournald2(void *arg)
141 wake_up(&journal->j_wait_done_commit);
143 + if (cleaning(journal) || low_on_space(journal)) {
144 + if (try_to_move_tail(journal) && high_on_space(journal))
145 + stop_cleaning(journal);
147 + start_cleaning(journal);
150 if (freezing(current)) {
152 * The simpler the better. Flushing journal isn't a
153 @@ -255,6 +263,8 @@ static int kjournald2(void *arg)
155 if (journal->j_flags & JBD2_UNMOUNT)
157 + if (cleaning_batch_complete(journal))
160 write_unlock(&journal->j_state_lock);
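
cleaning_batch_complete() is also only declared by this patch; judging from
the fields in struct cleaner_ctx it plausibly reduces to a predicate like the
following sketch (an assumption, not the patch's actual implementation):

bool cleaning_batch_complete(journal_t *journal)
{
	struct cleaner_ctx *ctx = journal->j_cleaner_ctx;

	return cleaning(journal) &&
	       atomic_read(&ctx->batch_in_progress) &&
	       atomic_read(&ctx->nr_pending_reads) == 0;
}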
162 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
163 index 317efb491569..350d5d229b68 100644
164 --- a/include/linux/jbd2.h
165 +++ b/include/linux/jbd2.h
166 @@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
167 * @j_superblock: Second part of superblock buffer
168 * @j_map: A map from file system blocks to log blocks
169 * @j_transaction_infos: An array of information structures per live transaction
170 - * @j_map_lock: Protect j_jmap and j_transaction_infos
171 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
172 + * @j_cleaner_ctx: Cleaner state
173 * @j_format_version: Version of the superblock format
174 * @j_state_lock: Protect the various scalars in the journal
175 * @j_barrier_count: Number of processes waiting to create a barrier lock
176 @@ -820,6 +821,9 @@ struct journal_s
177 /* Protect j_jmap and j_transaction_infos */
178 rwlock_t j_jmap_lock;
180 + /* Cleaner state */
181 + struct cleaner_ctx *j_cleaner_ctx;
183 /* Version of the superblock format */
184 int j_format_version;
186 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
187 index d068358380b0..b734551ddb67 100644
188 --- a/include/linux/jmap.h
189 +++ b/include/linux/jmap.h
191 #include <linux/journal-head.h>
192 #include <linux/list.h>
193 #include <linux/circ_buf.h>
194 +#include <linux/completion.h>
197 + * Forward declaration for journal_t so that we don't get circular dependency
198 + * between jbd2.h and jmap.h
201 +typedef struct journal_s journal_t;
204 * Maximum number of transactions. This guides the size of the circular buffer
206 #define MAX_LIVE_TRANSACTIONS 65536
209 - * Forward declaration for journal_t so that we don't get circular dependency
210 - * between jbd2.h and jmap.h
213 -typedef struct journal_s journal_t;
216 * A mapping from file system block to log block.
219 @@ -79,14 +80,14 @@ struct transaction_info {
223 - * A list of live log blocks referenced in the RB-tree that belong to
224 - * this transaction. It is used during cleaning to locate live blocks
225 - * and migrate them to appropriate location. If this list is empty,
226 - * then the transaction does not contain any live blocks and we can
227 - * reuse its space. If this list is not empty, then we can quickly
228 - * locate all the live blocks in this transaction.
229 + * A list of live blocks referenced in the RB-tree that belong to this
230 + * transaction. It is used during cleaning to locate live blocks and
231 + * migrate them to an appropriate location. If this list is empty, then
232 + * the transaction does not contain any live blocks and we can reuse its
233 + * space. If this list is not empty, then we can quickly locate all the
234 + * live blocks in this transaction.
236 - struct list_head live_logblks;
237 + struct list_head live_blks;
241 @@ -126,4 +127,86 @@ extern void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags, int nr,
242 extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
246 + * Cleaner state and interface are below.
250 + * Number of blocks to read at once, for cleaning.
252 +#define CLEANER_BATCH_SIZE 16
255 + * Context structure for the cleaner.
257 +struct cleaner_ctx {
259 + * Set to true once we drop below the low watermark; it stays true until
260 + * we rise above the high watermark. It is accessed by the commit
261 + * thread and by foreground kernel threads during journal
262 + * destruction, therefore it is atomic.
267 + * We clean in batches of blocks. This flag indicates if we are
268 + * currently cleaning a batch. It is accessed by the commit thread and
269 + * the cleaner thread, therefore it is atomic.
271 + atomic_t batch_in_progress;
274 + * We find live blocks to clean from the live blocks list of the
275 + * transaction at the tail. This list can be larger than our batch size
276 + * and we may need several attempts to process it. We cache the
277 + * position of the next entry to start from in |pos|. Since the
278 + * cleaner thread can run concurrently with the commit thread, which
279 + * can modify the live blocks list of the tail transaction (for
280 + * example, if it needs to drop a revoked entry, or if |pos| points to
281 + * an entry that has been updated and should move from the live blocks
282 + * list of the tail transaction to that of the current transaction),
283 + * we protect |pos| with |pos_lock|.
285 + struct jmap_entry *pos;
286 + spinlock_t pos_lock;
289 + * Live block mappings for the blocks that we copy in a batch.
291 + struct blk_mapping mappings[CLEANER_BATCH_SIZE];
294 + * Buffer heads for the live blocks read in a batch.
296 + struct buffer_head *bhs[CLEANER_BATCH_SIZE];
299 + * Number of pending reads in a batch. Every submitted read increments
300 + * it and every completed read decrements it.
302 + atomic_t nr_pending_reads;
305 + * The cleaner thread sleeps on this condition variable until the last
306 + * completed read wakes up the cleaner thread.
308 + struct completion live_block_reads;
310 + /* TODO: temporary for debugging, remove once done. */
311 + atomic_t nr_txns_committed;
312 + atomic_t nr_txns_cleaned;
314 + journal_t *journal;
315 + struct work_struct work;
318 +extern int low_on_space(journal_t *journal);
319 +extern int high_on_space(journal_t *journal);
320 +extern bool cleaning(journal_t *journal);
321 +extern void stop_cleaning(journal_t *journal);
322 +extern void start_cleaning(journal_t *journal);
323 +extern void clean_next_batch(journal_t *journal);
324 +extern bool cleaning_batch_complete(journal_t *journal);
325 +extern bool try_to_move_tail(journal_t *journal);
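
To illustrate how nr_pending_reads and live_block_reads are meant to
interact, here is a sketch under assumptions (not code from the patch): each
submitted read bumps nr_pending_reads, the last completed read wakes the
cleaner, and the cleaner blocks on the completion before writing the batch
out.

static void live_block_read_endio(struct buffer_head *bh, int uptodate)
{
	struct cleaner_ctx *ctx = bh->b_private;

	if (uptodate)
		set_buffer_uptodate(bh);
	unlock_buffer(bh);

	/* Last read of the batch lets the cleaner proceed. */
	if (atomic_dec_and_test(&ctx->nr_pending_reads))
		complete(&ctx->live_block_reads);
}

	/* Cleaner side, after submitting the batch of reads: */
	wait_for_completion(&ctx->live_block_reads);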