/*
 * fs/logfs/gc.c	- garbage collection code
 *
 * As should be obvious for Linux kernel code, license is GPLv2
 *
 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
 */
#include "logfs.h"
#include <linux/sched.h>
#include <linux/slab.h>

/*
 * Wear leveling needs to kick in when the difference between low erase
 * counts and high erase counts gets too big.  A good value for "too big"
 * may be somewhat below 10% of maximum erase count for the device.
 * Why not 397, to pick a nice round number with no specific meaning? :)
 *
 * WL_RATELIMIT is the minimum time between two wear level events.  A huge
 * number of segments may fulfil the requirements for wear leveling at the
 * same time.  If that happens we don't want to cause a latency from hell,
 * but just gently pick one segment every so often and minimize overhead.
 */
#define WL_DELTA 397
#define WL_RATELIMIT 100
#define MAX_OBJ_ALIASES	2600
#define SCAN_RATIO 512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE 64	/* base size of candidate lists */
#define SCAN_ROUNDS 128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */

static int no_free_segments(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	return super->s_free_list.count;
}

/* journal has distance -1, top-most ifile layer distance 0 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}

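/*
 * A segment is reserved if it is flagged in the reserved-segments btree or
 * currently open as one of the write areas.
 */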
static int segment_is_reserved(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area;
	void *reserved;
	int i;

	/* Some segments are reserved.  Just pretend they were all valid */
	reserved = btree_lookup32(&super->s_reserved_segments, segno);
	if (reserved)
		return 1;

	/* Currently open segments */
	for_each_area(i) {
		area = super->s_area[i];
		if (area->a_is_open && area->a_segno == segno)
			return 1;
	}

	return 0;
}

static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}

/*
 * Returns the bytes consumed by valid objects in this segment.  Object headers
 * are counted, the segment header is not.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}

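/*
 * Rewrite a single still-valid block through the normal write path so the
 * copy in the segment being cleaned becomes obsolete.
 */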
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}

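/*
 * Garbage collect one segment: walk its object headers, rewrite every block
 * that is still valid and return the number of bytes cleaned.  The segment
 * is temporarily marked reserved so it cannot be reused while its contents
 * are being moved.
 */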
static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}

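/*
 * Insert a candidate into an rb-tree backed list, sorted either by erase
 * count or by valid bytes.  If the list grows beyond its maximum size, the
 * worst (last) candidate is dropped from the tree and returned to the
 * caller; otherwise NULL is returned.
 */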
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}

static void remove_from_list(struct gc_candidate *cand)
{
	struct candidate_list *list = cand->list;

	rb_erase(&cand->rb_node, &list->rb_tree);
	list->count--;
}

static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);

	btree_remove32(&super->s_cand_tree, cand->segno);
	kfree(cand);
}

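/*
 * Pop the best (first) candidate from a list and return its segment number.
 * The erase count is passed back through *ec if the caller asked for it.
 */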
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}

/*
 * We have several lists to manage segments with.  The reserve_list is used to
 * deal with bad blocks.  We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage.  It usually gets the
 * second pick after the reserve_list.  But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list.  If we have
 * to run garbage collection, we pick a candidate from there.  All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while.  We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}

static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
		u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = kmalloc(sizeof(*cand), GFP_NOFS);
	if (!cand)
		return -ENOMEM;

	cand->segno = segno;
	cand->valid = valid;
	cand->erase_count = ec;
	cand->dist = dist;

	btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
	__add_candidate(sb, cand);
	return 0;
}

static void remove_segment_from_lists(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;

	cand = btree_lookup32(&super->s_cand_tree, segno);
	if (cand) {
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
}

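/*
 * Refresh the candidate information for one segment: drop any stale entry,
 * re-read the segment entry and re-add the segment to the appropriate lists.
 */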
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}

static struct gc_candidate *first_in_list(struct candidate_list *list)
{
	if (list->count == 0)
		return NULL;
	return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
}

/*
 * Find the best segment for garbage collection.  Main criterion is
 * the segment requiring the least effort to clean.  Secondary
 * criterion is to GC on the lowest level available.
 *
 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
 * less effort.  Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
 */
static struct gc_candidate *get_candidate(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i, max_dist;
	struct gc_candidate *cand = NULL, *this;

	max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);

	for (i = max_dist; i >= 0; i--) {
		this = first_in_list(&super->s_low_list[i]);
		if (!this)
			continue;
		if (!cand)
			cand = this;
		if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
			cand = this;
	}
	return cand;
}

static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	BUG_ON(cleaned != valid);
	return 1;
}

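/* Pick the cheapest candidate available and garbage collect it. */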
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand;

	cand = get_candidate(sb);
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}

/* returns 1 if a wrap occurs, 0 otherwise */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop.  We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}

/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/*
	 * Doing too many changes to the segfile at once would result
	 * in a large number of aliases.  Write the journal before
	 * things get out of hand.
	 */
	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
		logfs_write_anchor(sb);

	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(sb);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}

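/*
 * Rate-limit wear leveling against the global erase count.  Returns 0 (and
 * schedules the next event) when enough erases have happened since the last
 * wear level event, 1 when the caller should skip this round.
 */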
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
{
	struct logfs_super *super = logfs_super(sb);

	if (*next_event < super->s_gec) {
		*next_event = super->s_gec + WL_RATELIMIT;
		return 0;
	}
	return 1;
}

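/*
 * Ostore wear leveling: pick the used segment with the lowest erase count
 * and garbage collect it, unless it is already WL_DELTA erases ahead of the
 * least-worn free segment.
 */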
static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}

/*
 * The journal needs wear leveling as well.  But moving the journal is an
 * expensive operation so we try to avoid it as much as possible.  And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations.  First we check whether moving the journal would be a
 * significant improvement.  That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal.  Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}

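/*
 * Main entry point for garbage collection: free enough segments to reach
 * s_total_levels, then run the ostore and journal wear leveling passes.
 */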
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(sb);
	__logfs_gc_pass(sb, super->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}

static int check_area(struct super_block *sb, int i)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[i];
	gc_level_t gc_level;
	u32 cleaned, valid, ec;
	u32 segno = area->a_segno;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);

	if (!area->a_is_open)
		return 0;

	if (super->s_devops->can_write_buf(sb, ofs) == 0)
		return 0;

	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
	/*
	 * The device cannot write back the write buffer.  Most likely the
	 * wbuf was already written out and the system crashed at some point
	 * before the journal commit happened.  In that case we wouldn't have
	 * to do anything.  But if the crash happened before the wbuf was
	 * written out correctly, we must GC this segment.  So assume the
	 * worst and always do the GC run.
	 */
	area->a_is_open = 0;
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	cleaned = logfs_gc_segment(sb, segno);
	if (cleaned != valid)
		return -EIO;
	return 0;
}

int logfs_check_areas(struct super_block *sb)
{
	int i, err;

	for_each_area(i) {
		err = check_area(sb, i);
		if (err)
			return err;
	}
	return 0;
}

static void logfs_init_candlist(struct candidate_list *list, int maxcount,
		int sort_by_ec)
{
	list->count = 0;
	list->maxcount = maxcount;
	list->sort_by_ec = sort_by_ec;
	list->rb_tree = RB_ROOT;
}

int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}

static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}

void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node.  So we
	 * call the grim visitor to clean up that mess.  Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}