add patch jbd2-dont-double-bump-transaction-number to unstable
[ext4-patch-queue.git] / cleaner
blob67e0e2e2b9d53259ec9ff2a95dd07562828a866f
1 Introduce cleaner
3 From: Abutalib Aghayev <agayev@cs.cmu.edu>
5 An experimental cleaner.  Copy the live blocks from the transaction at the
6 tail in batches to the transaction at the head.  After a commit ends, check
7 if free space is below the low watermark and, if so, clean until free space
8 is above the high watermark.
10 Signed-off-by: Abutalib Aghayev <agayev@cs.cmu.edu>
12 ---
13  fs/jbd2/Makefile     |   2 +-
14  fs/jbd2/jmap.c       |  43 ++++++++++++++++++++++++++++++-----
15  fs/jbd2/journal.c    |  12 +++++++++-
16  include/linux/jbd2.h |   6 ++++-
17  include/linux/jmap.h | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
18  5 files changed, 151 insertions(+), 23 deletions(-)
20 diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
21 index a54f50b3a06e..b6a2dddcc0a7 100644
22 --- a/fs/jbd2/Makefile
23 +++ b/fs/jbd2/Makefile
24 @@ -5,4 +5,4 @@
25  obj-$(CONFIG_JBD2) += jbd2.o
27  jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o \
28 -               jmap.o
29 +               jmap.o cleaner.o
30 diff --git a/fs/jbd2/jmap.c b/fs/jbd2/jmap.c
31 index 8c844f65eeaa..693b3e8d736c 100644
32 --- a/fs/jbd2/jmap.c
33 +++ b/fs/jbd2/jmap.c
34 @@ -38,7 +38,7 @@ int jbd2_init_transaction_infos(journal_t *journal)
35         }
37         for (i = 0; i < MAX_LIVE_TRANSACTIONS; ++i)
38 -               INIT_LIST_HEAD(&tis->buf[i].live_logblks);
39 +               INIT_LIST_HEAD(&tis->buf[i].live_blks);
41         journal->j_transaction_infos = tis;
42         return 0;
43 @@ -91,15 +91,26 @@ static int process_existing_mappings(journal_t *journal,
44                  * We are either deleting the entry because it was revoked, or
45                  * we are moving it to the live blocks list of this transaction.
46                  * In either case, we remove it from its existing list.
47 +                * However, before removing it we check whether this is the
48 +                * entry in the tail transaction's live blocks list that the
49 +                * cleaner has cached as its position; if so, we advance the
50 +                * cached pointer to the next entry first.
51                  */
52 -               list_del(&je->list);
53 +               spin_lock(&journal->j_cleaner_ctx->pos_lock);
54 +               if (je == journal->j_cleaner_ctx->pos) {
55 +                       journal->j_cleaner_ctx->pos = list_next_entry(je, list);
56 +                       trace_jbd2_jmap_printf1("updating pos to",
57 +                                               (unsigned long long) journal->j_cleaner_ctx->pos);
58 +               }
59 +               list_del(&je->list);
60 +               spin_unlock(&journal->j_cleaner_ctx->pos_lock);
62                 if (je->revoked) {
63                         rb_erase(&je->rb_node, &journal->j_jmap);
64                         kmem_cache_free(jbd2_jmap_cache, je);
65                 } else {
66                         trace_jbd2_jmap_replace(je, &mappings[i], t_idx);
67 -                       fill_entry(je, &mappings[i], t_idx, &ti->live_logblks);
68 +                       fill_entry(je, &mappings[i], t_idx, &ti->live_blks);
69                 }
70         }
71         return nr_new;
72 @@ -161,8 +172,7 @@ static void add_new_mappings(journal_t *journal, struct transaction_info *ti,
73                         else
74                                 BUG_ON(1);
75                 }
76 -               fill_entry(new_entries[i], &mappings[i], t_idx,
77 -                       &ti->live_logblks);
78 +               fill_entry(new_entries[i], &mappings[i], t_idx, &ti->live_blks);
79                 rb_link_node(&new_entries[i]->rb_node, parent, p);
80                 rb_insert_color(&new_entries[i]->rb_node, &journal->j_jmap);
81                 trace_jbd2_jmap_insert(&mappings[i], t_idx);
82 @@ -189,7 +199,9 @@ int jbd2_transaction_infos_add(journal_t *journal, transaction_t *transaction,
83          * We are possibly reusing space of an old transaction_info.  The old
84          * transaction should not have any live blocks in it.
85          */
86 -       BUG_ON(!list_empty(&ti->live_logblks));
87 +       BUG_ON(!list_empty(&ti->live_blks));
89 +       atomic_inc(&journal->j_cleaner_ctx->nr_txns_committed);
91         write_lock(&journal->j_jmap_lock);
92         nr_new = process_existing_mappings(journal, ti, t_idx, mappings,
93 @@ -432,12 +444,31 @@ int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
95  int jbd2_smr_journal_init(journal_t *journal)
96  {
97 +       journal->j_cleaner_ctx = kzalloc(sizeof(struct cleaner_ctx),
98 +                                       GFP_KERNEL);
99 +       if (!journal->j_cleaner_ctx)
100 +               return -ENOMEM;
102 +       journal->j_cleaner_ctx->journal = journal;
103 +       journal->j_cleaner_ctx->pos = NULL;
104 +       spin_lock_init(&journal->j_cleaner_ctx->pos_lock);
105 +       atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
106 +       atomic_set(&journal->j_cleaner_ctx->batch_in_progress, 0);
107 +       atomic_set(&journal->j_cleaner_ctx->nr_pending_reads, 0);
108 +       atomic_set(&journal->j_cleaner_ctx->nr_txns_committed, 0);
109 +       atomic_set(&journal->j_cleaner_ctx->nr_txns_cleaned, 0);
110 +       init_completion(&journal->j_cleaner_ctx->live_block_reads);
112         journal->j_jmap = RB_ROOT;
113         rwlock_init(&journal->j_jmap_lock);
115         return jbd2_init_transaction_infos(journal);
118  void jbd2_smr_journal_exit(journal_t *journal)
120 +       atomic_set(&journal->j_cleaner_ctx->cleaning, 0);
121 +       flush_work(&journal->j_cleaner_ctx->work);
122 +       kfree(journal->j_cleaner_ctx);
123         jbd2_free_transaction_infos(journal);
125 diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
126 index 0cbfb7fdc45d..8e305aacef48 100644
127 --- a/fs/jbd2/journal.c
128 +++ b/fs/jbd2/journal.c
129 @@ -51,7 +51,7 @@
130  #include <asm/page.h>
132  #ifdef CONFIG_JBD2_DEBUG
133 -ushort jbd2_journal_enable_debug __read_mostly;
134 +ushort jbd2_journal_enable_debug __read_mostly = 1;
135  EXPORT_SYMBOL(jbd2_journal_enable_debug);
137  module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
138 @@ -227,6 +227,14 @@ static int kjournald2(void *arg)
139         }
141         wake_up(&journal->j_wait_done_commit);
143 +       if (cleaning(journal) || low_on_space(journal)) {
144 +               if (try_to_move_tail(journal) && high_on_space(journal))
145 +                       stop_cleaning(journal);
146 +               else
147 +                       start_cleaning(journal);
148 +       }
150         if (freezing(current)) {
151                 /*
152                  * The simpler the better. Flushing journal isn't a
153 @@ -255,6 +263,8 @@ static int kjournald2(void *arg)
154                         should_sleep = 0;
155                 if (journal->j_flags & JBD2_UNMOUNT)
156                         should_sleep = 0;
157 +               if (cleaning_batch_complete(journal))
158 +                       should_sleep = 0;
159                 if (should_sleep) {
160                         write_unlock(&journal->j_state_lock);
161                         schedule();
162 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
163 index 317efb491569..350d5d229b68 100644
164 --- a/include/linux/jbd2.h
165 +++ b/include/linux/jbd2.h
166 @@ -735,7 +735,8 @@ jbd2_time_diff(unsigned long start, unsigned long end)
167   * @j_superblock: Second part of superblock buffer
168   * @j_map: A map from file system blocks to log blocks
169   * @j_transaction_infos: An array of information structures per live transaction
170 - * @j_map_lock: Protect j_jmap and j_transaction_infos
171 + * @j_jmap_lock: Protect j_jmap and j_transaction_infos
172 + * @j_cleaner_ctx: Cleaner state
173   * @j_format_version: Version of the superblock format
174   * @j_state_lock: Protect the various scalars in the journal
175   * @j_barrier_count:  Number of processes waiting to create a barrier lock
176 @@ -820,6 +821,9 @@ struct journal_s
177         /* Protect j_jmap and j_transaction_infos */
178         rwlock_t                j_jmap_lock;
180 +       /* Cleaner state */
181 +       struct cleaner_ctx      *j_cleaner_ctx;
183         /* Version of the superblock format */
184         int                     j_format_version;
186 diff --git a/include/linux/jmap.h b/include/linux/jmap.h
187 index d068358380b0..b734551ddb67 100644
188 --- a/include/linux/jmap.h
189 +++ b/include/linux/jmap.h
190 @@ -5,6 +5,14 @@
191  #include <linux/journal-head.h>
192  #include <linux/list.h>
193  #include <linux/circ_buf.h>
194 +#include <linux/completion.h>
197 + * Forward declaration for journal_t so that we don't get circular dependency
198 + * between jbd2.h and jmap.h
199 + */
200 +struct journal_s;
201 +typedef struct journal_s journal_t;
203  /*
204   * Maximum number of transactions.  This guides the size of the circular buffer
205 @@ -17,13 +25,6 @@
206  #define MAX_LIVE_TRANSACTIONS 65536
208  /*
209 - * Forward declaration for journal_t so that we don't get circular dependency
210 - * between jbd2.h and jmap.h
211 - */
212 -struct journal_s;
213 -typedef struct journal_s journal_t;
216   * A mapping from file system block to log block.
217   */
218  struct blk_mapping {
219 @@ -79,14 +80,14 @@ struct transaction_info {
220         sector_t offset;
222         /*
223 -        * A list of live log blocks referenced in the RB-tree that belong to
224 -        * this transaction.  It is used during cleaning to locate live blocks
225 -        * and migrate them to appropriate location.  If this list is empty,
226 -        * then the transaction does not contain any live blocks and we can
227 -        * reuse its space.  If this list is not empty, then we can quickly
228 -        * locate all the live blocks in this transaction.
229 +        * A list of live blocks referenced in the RB-tree that belong to this
230 +        * transaction.  It is used during cleaning to locate live blocks and
231 +        * migrate them to appropriate location.  If this list is empty, then
232 +        * the transaction does not contain any live blocks and we can reuse its
233 +        * space.  If this list is not empty, then we can quickly locate all the
234 +        * live blocks in this transaction.
235          */
236 -       struct list_head live_logblks;
237 +       struct list_head live_blks;
238  };
240  /*
241 @@ -126,4 +127,86 @@ extern void jbd2_ll_rw_block(journal_t *journal, int rw, int op_flags, int nr,
242  extern int jbd2_bh_submit_read(journal_t *journal, struct buffer_head *bh,
243                                const char *func);
246 + * Cleaner stuff is below.
247 + */
250 + * Number of blocks to read at once, for cleaning.
251 + */
252 +#define CLEANER_BATCH_SIZE 16
255 + * Context structure for the cleaner.
256 + */
257 +struct cleaner_ctx {
258 +       /*
259 +        * Set to true once we drop below the low watermark; it stays so until
260 +        * we rise above the high watermark.  It is accessed by the commit
261 +        * thread and by foreground kernel threads during journal
262 +        * destruction, therefore it is atomic.
263 +        */
264 +       atomic_t cleaning;
266 +       /*
267 +        * We clean in batches of blocks.  This flag indicates if we are
268 +        * currently cleaning a batch.  It is accessed by the commit thread and
269 +        * the cleaner thread, therefore it is atomic.
270 +        */
271 +       atomic_t batch_in_progress;
273 +       /*
274 +        * We find live blocks to clean from the live blocks list of the
275 +        * transaction at the tail.  This list can be larger than our batch size
276 +        * and we may need several attempts to process it.  We cache the
277 +        * position of the next entry to start from in |pos|.  Since cleaner
278 +        * thread can run concurrently with the commit thread that can modify
279 +        * the live blocks list of the transaction at the tail (for example, if
280 +        * it needs to drop a revoked entry or if |pos| points to an entry that
281 +        * has been updated and should move from the live blocks list of the
282 +        * transaction at the tail to the live blocks list of current
283 +        * transaction) we protect |pos| with |pos_lock|.
284 +        */
285 +       struct jmap_entry *pos;
286 +       spinlock_t pos_lock;
288 +       /*
289 +        * Live block mappings for the blocks that we copy in a batch.
290 +        */
291 +       struct blk_mapping mappings[CLEANER_BATCH_SIZE];
293 +       /*
294 +        * Buffer heads for the live blocks read in a batch.
295 +        */
296 +       struct buffer_head *bhs[CLEANER_BATCH_SIZE];
298 +       /*
299 +        * Number of pending reads in a batch.  Every submitted read increments
300 +        * it and every completed read decrements it.
301 +        */
302 +       atomic_t nr_pending_reads;
304 +       /*
305 +        * The cleaner thread sleeps on this condition variable until the last
306 +        * completed read wakes up the cleaner thread.
307 +        */
308 +       struct completion live_block_reads;
310 +       /* TODO: temporary for debugging, remove once done. */
311 +       atomic_t nr_txns_committed;
312 +       atomic_t nr_txns_cleaned;
314 +       journal_t *journal;
315 +       struct work_struct work;
318 +extern int low_on_space(journal_t *journal);
319 +extern int high_on_space(journal_t *journal);
320 +extern bool cleaning(journal_t *journal);
321 +extern void stop_cleaning(journal_t *journal);
322 +extern void start_cleaning(journal_t *journal);
323 +extern void clean_next_batch(journal_t *journal);
324 +extern bool cleaning_batch_complete(journal_t *journal);
325 +extern bool try_to_move_tail(journal_t *journal);
327  #endif