2 * Swap block device support for MTDs
3 * Turns an MTD device into a swap device with block wear leveling
5 * Copyright © 2007,2011 Nokia Corporation. All rights reserved.
7 * Authors: Jarkko Lavinen <jarkko.lavinen@nokia.com>
9 * Based on Richard Purdie's earlier implementation in 2007. Background
10 * support and lock-less operation written by Adrian Hunter.
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * version 2 as published by the Free Software Foundation.
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 #include <linux/kernel.h>
28 #include <linux/module.h>
29 #include <linux/mtd/mtd.h>
30 #include <linux/mtd/blktrans.h>
31 #include <linux/rbtree.h>
32 #include <linux/sched.h>
33 #include <linux/slab.h>
34 #include <linux/vmalloc.h>
35 #include <linux/genhd.h>
36 #include <linux/swap.h>
37 #include <linux/debugfs.h>
38 #include <linux/seq_file.h>
39 #include <linux/device.h>
40 #include <linux/math64.h>
42 #define MTDSWAP_PREFIX "mtdswap"
45 * The number of free eraseblocks when GC should stop
47 #define CLEAN_BLOCK_THRESHOLD 20
50 * Number of free eraseblocks below which GC can also collect low frag
53 #define LOW_FRAG_GC_THRESHOLD 5
56 * Wear level cost amortization. We want to do wear leveling in the background
57 * without disturbing gc too much. This is done by defining max GC frequency.
58 * Frequency value 6 means 1/6 of the GC passes will pick an erase block based
59 * on the biggest wear difference rather than the biggest dirtiness.
61 * The lower freq2 should be chosen so that it makes sure the maximum erase
62 * difference will decrease even if a malicious application is deliberately
63 * trying to make erase differences large.
/* Wear-leveling tuning knobs used by the GC tree selection code. */
#define MAX_ERASE_DIFF		4000
#define COLLECT_NONDIRTY_BASE	MAX_ERASE_DIFF
#define COLLECT_NONDIRTY_FREQ1	6
#define COLLECT_NONDIRTY_FREQ2	4

/*
 * Sentinel values stored in the page/block mapping tables. Every value
 * up to and including BLOCK_MAX is treated as a real block number; the
 * values above it are markers.
 */
#define PAGE_UNDEF		UINT_MAX
#define BLOCK_UNDEF		UINT_MAX
#define BLOCK_ERROR		(UINT_MAX - 1)
#define BLOCK_MAX		(UINT_MAX - 2)

/*
 * Per-eraseblock flag bits. The bits from EBLOCK_IDX_SHIFT upwards hold a
 * tree index while the device is being scanned.
 */
#define EBLOCK_BAD		(1 << 0)
#define EBLOCK_NOMAGIC		(1 << 1)
#define EBLOCK_BITFLIP		(1 << 2)
#define EBLOCK_FAILED		(1 << 3)
#define EBLOCK_READERR		(1 << 4)
#define EBLOCK_IDX_SHIFT	5
87 unsigned int active_count
;
88 unsigned int erase_count
;
89 unsigned int pad
; /* speeds up pointer decrement */
92 #define MTDSWAP_ECNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swap_eb, \
94 #define MTDSWAP_ECNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swap_eb, \
114 struct mtd_blktrans_dev
*mbd_dev
;
115 struct mtd_info
*mtd
;
118 unsigned int *page_data
;
119 unsigned int *revmap
;
122 unsigned int spare_eblks
;
123 unsigned int pages_per_eblk
;
124 unsigned int max_erase_count
;
125 struct swap_eb
*eb_data
;
127 struct mtdswap_tree trees
[MTDSWAP_TREE_CNT
];
129 unsigned long long sect_read_count
;
130 unsigned long long sect_write_count
;
131 unsigned long long mtd_write_count
;
132 unsigned long long mtd_read_count
;
133 unsigned long long discard_count
;
134 unsigned long long discard_page_count
;
136 unsigned int curr_write_pos
;
137 struct swap_eb
*curr_write
;
143 struct mtdswap_oobdata
{
/* Magic numbers written to the OOB area to tag an erase block's state. */
#define MTDSWAP_MAGIC_CLEAN	0x2095
#define MTDSWAP_MAGIC_DIRTY	(MTDSWAP_MAGIC_CLEAN + 1)
/* Marker kinds accepted by mtdswap_write_marker(). */
#define MTDSWAP_TYPE_CLEAN	0
#define MTDSWAP_TYPE_DIRTY	1
/* Parenthesized so the expansion is safe inside larger expressions. */
#define MTDSWAP_OOBSIZE		(sizeof(struct mtdswap_oobdata))

#define MTDSWAP_ERASE_RETRIES	3 /* Before marking erase block bad */
#define MTDSWAP_IO_RETRIES	3
158 MTDSWAP_SCANNED_CLEAN
,
159 MTDSWAP_SCANNED_DIRTY
,
160 MTDSWAP_SCANNED_BITFLIP
,
165 * In the worst case mtdswap_writesect() has allocated the last clean
166 * page from the current block and is then pre-empted by the GC
167 * thread. The thread can consume a full erase block when moving a
#define MIN_SPARE_EBLOCKS	2
#define MIN_ERASE_BLOCKS	(MIN_SPARE_EBLOCKS + 1)

/*
 * Accessors for the per-state erase block trees. @name is the tree name
 * without its MTDSWAP_ prefix (e.g. CLEAN, DIRTY). The @d argument is
 * parenthesized so that any pointer expression can be passed safely.
 * TREE_COUNT() expands to an lvalue and may be incremented/decremented.
 */
#define TREE_ROOT(d, name) (&(d)->trees[MTDSWAP_ ## name].root)
#define TREE_EMPTY(d, name) (TREE_ROOT(d, name)->rb_node == NULL)
#define TREE_NONEMPTY(d, name) (!TREE_EMPTY(d, name))
#define TREE_COUNT(d, name) ((d)->trees[MTDSWAP_ ## name].count)

/* Fetch the mtdswap device stored in a blktrans device's priv pointer. */
#define MTDSWAP_MBD_TO_MTDSWAP(dev) ((struct mtdswap_dev *)(dev)->priv)
180 static char partitions
[128] = "";
181 module_param_string(partitions
, partitions
, sizeof(partitions
), 0444);
182 MODULE_PARM_DESC(partitions
, "MTD partition numbers to use as swap "
183 "partitions=\"1,3,5\"");
185 static unsigned int spare_eblocks
= 10;
186 module_param(spare_eblocks
, uint
, 0444);
187 MODULE_PARM_DESC(spare_eblocks
, "Percentage of spare erase blocks for "
188 "garbage collection (default 10%)");
190 static bool header
; /* false */
191 module_param(header
, bool, 0444);
192 MODULE_PARM_DESC(header
,
193 "Include builtin swap header (default 0, without header)");
195 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
);
197 static loff_t
mtdswap_eb_offset(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
199 return (loff_t
)(eb
- d
->eb_data
) * d
->mtd
->erasesize
;
202 static void mtdswap_eb_detach(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
205 struct mtdswap_tree
*tp
;
208 tp
= container_of(eb
->root
, struct mtdswap_tree
, root
);
209 oldidx
= tp
- &d
->trees
[0];
211 d
->trees
[oldidx
].count
--;
212 rb_erase(&eb
->rb
, eb
->root
);
216 static void __mtdswap_rb_add(struct rb_root
*root
, struct swap_eb
*eb
)
218 struct rb_node
**p
, *parent
= NULL
;
224 cur
= rb_entry(parent
, struct swap_eb
, rb
);
225 if (eb
->erase_count
> cur
->erase_count
)
231 rb_link_node(&eb
->rb
, parent
, p
);
232 rb_insert_color(&eb
->rb
, root
);
235 static void mtdswap_rb_add(struct mtdswap_dev
*d
, struct swap_eb
*eb
, int idx
)
237 struct rb_root
*root
;
239 if (eb
->root
== &d
->trees
[idx
].root
)
242 mtdswap_eb_detach(d
, eb
);
243 root
= &d
->trees
[idx
].root
;
244 __mtdswap_rb_add(root
, eb
);
246 d
->trees
[idx
].count
++;
249 static struct rb_node
*mtdswap_rb_index(struct rb_root
*root
, unsigned int idx
)
256 while (i
< idx
&& p
) {
264 static int mtdswap_handle_badblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
270 eb
->flags
|= EBLOCK_BAD
;
271 mtdswap_eb_detach(d
, eb
);
274 /* badblocks not supported */
275 if (!mtd_can_have_bb(d
->mtd
))
278 offset
= mtdswap_eb_offset(d
, eb
);
279 dev_warn(d
->dev
, "Marking bad block at %08llx\n", offset
);
280 ret
= mtd_block_markbad(d
->mtd
, offset
);
283 dev_warn(d
->dev
, "Mark block bad failed for block at %08llx "
284 "error %d\n", offset
, ret
);
292 static int mtdswap_handle_write_error(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
294 unsigned int marked
= eb
->flags
& EBLOCK_FAILED
;
295 struct swap_eb
*curr_write
= d
->curr_write
;
297 eb
->flags
|= EBLOCK_FAILED
;
298 if (curr_write
== eb
) {
299 d
->curr_write
= NULL
;
301 if (!marked
&& d
->curr_write_pos
!= 0) {
302 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
307 return mtdswap_handle_badblock(d
, eb
);
310 static int mtdswap_read_oob(struct mtdswap_dev
*d
, loff_t from
,
311 struct mtd_oob_ops
*ops
)
313 int ret
= mtd_read_oob(d
->mtd
, from
, ops
);
315 if (mtd_is_bitflip(ret
))
319 dev_warn(d
->dev
, "Read OOB failed %d for block at %08llx\n",
324 if (ops
->oobretlen
< ops
->ooblen
) {
325 dev_warn(d
->dev
, "Read OOB return short read (%zd bytes not "
326 "%zd) for block at %08llx\n",
327 ops
->oobretlen
, ops
->ooblen
, from
);
334 static int mtdswap_read_markers(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
336 struct mtdswap_oobdata
*data
, *data2
;
339 struct mtd_oob_ops ops
;
341 offset
= mtdswap_eb_offset(d
, eb
);
343 /* Check first if the block is bad. */
344 if (mtd_can_have_bb(d
->mtd
) && mtd_block_isbad(d
->mtd
, offset
))
345 return MTDSWAP_SCANNED_BAD
;
347 ops
.ooblen
= 2 * d
->mtd
->oobavail
;
348 ops
.oobbuf
= d
->oob_buf
;
351 ops
.mode
= MTD_OPS_AUTO_OOB
;
353 ret
= mtdswap_read_oob(d
, offset
, &ops
);
355 if (ret
&& !mtd_is_bitflip(ret
))
358 data
= (struct mtdswap_oobdata
*)d
->oob_buf
;
359 data2
= (struct mtdswap_oobdata
*)
360 (d
->oob_buf
+ d
->mtd
->oobavail
);
362 if (le16_to_cpu(data
->magic
) == MTDSWAP_MAGIC_CLEAN
) {
363 eb
->erase_count
= le32_to_cpu(data
->count
);
364 if (mtd_is_bitflip(ret
))
365 ret
= MTDSWAP_SCANNED_BITFLIP
;
367 if (le16_to_cpu(data2
->magic
) == MTDSWAP_MAGIC_DIRTY
)
368 ret
= MTDSWAP_SCANNED_DIRTY
;
370 ret
= MTDSWAP_SCANNED_CLEAN
;
373 eb
->flags
|= EBLOCK_NOMAGIC
;
374 ret
= MTDSWAP_SCANNED_DIRTY
;
380 static int mtdswap_write_marker(struct mtdswap_dev
*d
, struct swap_eb
*eb
,
383 struct mtdswap_oobdata n
;
386 struct mtd_oob_ops ops
;
389 ops
.oobbuf
= (uint8_t *)&n
;
390 ops
.mode
= MTD_OPS_AUTO_OOB
;
393 if (marker
== MTDSWAP_TYPE_CLEAN
) {
394 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_CLEAN
);
395 n
.count
= cpu_to_le32(eb
->erase_count
);
396 ops
.ooblen
= MTDSWAP_OOBSIZE
;
397 offset
= mtdswap_eb_offset(d
, eb
);
399 n
.magic
= cpu_to_le16(MTDSWAP_MAGIC_DIRTY
);
400 ops
.ooblen
= sizeof(n
.magic
);
401 offset
= mtdswap_eb_offset(d
, eb
) + d
->mtd
->writesize
;
404 ret
= mtd_write_oob(d
->mtd
, offset
, &ops
);
407 dev_warn(d
->dev
, "Write OOB failed for block at %08llx "
408 "error %d\n", offset
, ret
);
409 if (ret
== -EIO
|| mtd_is_eccerr(ret
))
410 mtdswap_handle_write_error(d
, eb
);
414 if (ops
.oobretlen
!= ops
.ooblen
) {
415 dev_warn(d
->dev
, "Short OOB write for block at %08llx: "
417 offset
, ops
.oobretlen
, ops
.ooblen
);
425 * Are there any erase blocks without MAGIC_CLEAN header, presumably
426 * because power was cut off after erase but before header write? We
427 * need to guesstimate the erase count.
429 static void mtdswap_check_counts(struct mtdswap_dev
*d
)
431 struct rb_root hist_root
= RB_ROOT
;
432 struct rb_node
*medrb
;
434 unsigned int i
, cnt
, median
;
437 for (i
= 0; i
< d
->eblks
; i
++) {
440 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
443 __mtdswap_rb_add(&hist_root
, eb
);
450 medrb
= mtdswap_rb_index(&hist_root
, cnt
/ 2);
451 median
= rb_entry(medrb
, struct swap_eb
, rb
)->erase_count
;
453 d
->max_erase_count
= MTDSWAP_ECNT_MAX(&hist_root
);
455 for (i
= 0; i
< d
->eblks
; i
++) {
458 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_READERR
))
459 eb
->erase_count
= median
;
461 if (eb
->flags
& (EBLOCK_NOMAGIC
| EBLOCK_BAD
| EBLOCK_READERR
))
464 rb_erase(&eb
->rb
, &hist_root
);
468 static void mtdswap_scan_eblks(struct mtdswap_dev
*d
)
474 for (i
= 0; i
< d
->eblks
; i
++) {
477 status
= mtdswap_read_markers(d
, eb
);
479 eb
->flags
|= EBLOCK_READERR
;
480 else if (status
== MTDSWAP_SCANNED_BAD
) {
481 eb
->flags
|= EBLOCK_BAD
;
486 case MTDSWAP_SCANNED_CLEAN
:
489 case MTDSWAP_SCANNED_DIRTY
:
490 case MTDSWAP_SCANNED_BITFLIP
:
494 idx
= MTDSWAP_FAILING
;
497 eb
->flags
|= (idx
<< EBLOCK_IDX_SHIFT
);
500 mtdswap_check_counts(d
);
502 for (i
= 0; i
< d
->eblks
; i
++) {
505 if (eb
->flags
& EBLOCK_BAD
)
508 idx
= eb
->flags
>> EBLOCK_IDX_SHIFT
;
509 mtdswap_rb_add(d
, eb
, idx
);
514 * Place eblk into a tree corresponding to its number of active blocks
517 static void mtdswap_store_eb(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
519 unsigned int weight
= eb
->active_count
;
520 unsigned int maxweight
= d
->pages_per_eblk
;
522 if (eb
== d
->curr_write
)
525 if (eb
->flags
& EBLOCK_BITFLIP
)
526 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
527 else if (eb
->flags
& (EBLOCK_READERR
| EBLOCK_FAILED
))
528 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
529 if (weight
== maxweight
)
530 mtdswap_rb_add(d
, eb
, MTDSWAP_USED
);
531 else if (weight
== 0)
532 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
533 else if (weight
> (maxweight
/2))
534 mtdswap_rb_add(d
, eb
, MTDSWAP_LOWFRAG
);
536 mtdswap_rb_add(d
, eb
, MTDSWAP_HIFRAG
);
539 static int mtdswap_erase_block(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
541 struct mtd_info
*mtd
= d
->mtd
;
542 struct erase_info erase
;
543 unsigned int retries
= 0;
547 if (eb
->erase_count
> d
->max_erase_count
)
548 d
->max_erase_count
= eb
->erase_count
;
551 memset(&erase
, 0, sizeof(struct erase_info
));
552 erase
.addr
= mtdswap_eb_offset(d
, eb
);
553 erase
.len
= mtd
->erasesize
;
555 ret
= mtd_erase(mtd
, &erase
);
557 if (retries
++ < MTDSWAP_ERASE_RETRIES
) {
559 "erase of erase block %#llx on %s failed",
560 erase
.addr
, mtd
->name
);
565 dev_err(d
->dev
, "Cannot erase erase block %#llx on %s\n",
566 erase
.addr
, mtd
->name
);
568 mtdswap_handle_badblock(d
, eb
);
575 static int mtdswap_map_free_block(struct mtdswap_dev
*d
, unsigned int page
,
579 struct swap_eb
*old_eb
= d
->curr_write
;
580 struct rb_root
*clean_root
;
583 if (old_eb
== NULL
|| d
->curr_write_pos
>= d
->pages_per_eblk
) {
585 if (TREE_EMPTY(d
, CLEAN
))
588 clean_root
= TREE_ROOT(d
, CLEAN
);
589 eb
= rb_entry(rb_first(clean_root
), struct swap_eb
, rb
);
590 rb_erase(&eb
->rb
, clean_root
);
592 TREE_COUNT(d
, CLEAN
)--;
594 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_DIRTY
);
595 } while (ret
== -EIO
|| mtd_is_eccerr(ret
));
600 d
->curr_write_pos
= 0;
603 mtdswap_store_eb(d
, old_eb
);
606 *block
= (d
->curr_write
- d
->eb_data
) * d
->pages_per_eblk
+
609 d
->curr_write
->active_count
++;
610 d
->revmap
[*block
] = page
;
616 static unsigned int mtdswap_free_page_cnt(struct mtdswap_dev
*d
)
618 return TREE_COUNT(d
, CLEAN
) * d
->pages_per_eblk
+
619 d
->pages_per_eblk
- d
->curr_write_pos
;
622 static unsigned int mtdswap_enough_free_pages(struct mtdswap_dev
*d
)
624 return mtdswap_free_page_cnt(d
) > d
->pages_per_eblk
;
627 static int mtdswap_write_block(struct mtdswap_dev
*d
, char *buf
,
628 unsigned int page
, unsigned int *bp
, int gc_context
)
630 struct mtd_info
*mtd
= d
->mtd
;
638 while (!mtdswap_enough_free_pages(d
))
639 if (mtdswap_gc(d
, 0) > 0)
642 ret
= mtdswap_map_free_block(d
, page
, bp
);
643 eb
= d
->eb_data
+ (*bp
/ d
->pages_per_eblk
);
645 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
646 d
->curr_write
= NULL
;
648 d
->revmap
[*bp
] = PAGE_UNDEF
;
655 writepos
= (loff_t
)*bp
<< PAGE_SHIFT
;
656 ret
= mtd_write(mtd
, writepos
, PAGE_SIZE
, &retlen
, buf
);
657 if (ret
== -EIO
|| mtd_is_eccerr(ret
)) {
660 d
->revmap
[*bp
] = PAGE_UNDEF
;
661 mtdswap_handle_write_error(d
, eb
);
666 dev_err(d
->dev
, "Write to MTD device failed: %d (%zd written)",
671 if (retlen
!= PAGE_SIZE
) {
672 dev_err(d
->dev
, "Short write to MTD device: %zd written",
683 d
->revmap
[*bp
] = PAGE_UNDEF
;
688 static int mtdswap_move_block(struct mtdswap_dev
*d
, unsigned int oldblock
,
689 unsigned int *newblock
)
691 struct mtd_info
*mtd
= d
->mtd
;
692 struct swap_eb
*eb
, *oldeb
;
695 unsigned int page
, retries
;
698 page
= d
->revmap
[oldblock
];
699 readpos
= (loff_t
) oldblock
<< PAGE_SHIFT
;
703 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, d
->page_buf
);
705 if (ret
< 0 && !mtd_is_bitflip(ret
)) {
706 oldeb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
707 oldeb
->flags
|= EBLOCK_READERR
;
709 dev_err(d
->dev
, "Read Error: %d (block %u)\n", ret
,
712 if (retries
< MTDSWAP_IO_RETRIES
)
718 if (retlen
!= PAGE_SIZE
) {
719 dev_err(d
->dev
, "Short read: %zd (block %u)\n", retlen
,
725 ret
= mtdswap_write_block(d
, d
->page_buf
, page
, newblock
, 1);
727 d
->page_data
[page
] = BLOCK_ERROR
;
728 dev_err(d
->dev
, "Write error: %d\n", ret
);
732 eb
= d
->eb_data
+ *newblock
/ d
->pages_per_eblk
;
733 d
->page_data
[page
] = *newblock
;
734 d
->revmap
[oldblock
] = PAGE_UNDEF
;
735 eb
= d
->eb_data
+ oldblock
/ d
->pages_per_eblk
;
741 d
->page_data
[page
] = BLOCK_ERROR
;
742 d
->revmap
[oldblock
] = PAGE_UNDEF
;
746 static int mtdswap_gc_eblock(struct mtdswap_dev
*d
, struct swap_eb
*eb
)
748 unsigned int i
, block
, eblk_base
, newblock
;
752 eblk_base
= (eb
- d
->eb_data
) * d
->pages_per_eblk
;
754 for (i
= 0; i
< d
->pages_per_eblk
; i
++) {
755 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
758 block
= eblk_base
+ i
;
759 if (d
->revmap
[block
] == PAGE_UNDEF
)
762 ret
= mtdswap_move_block(d
, block
, &newblock
);
763 if (ret
< 0 && !errcode
)
770 static int __mtdswap_choose_gc_tree(struct mtdswap_dev
*d
)
774 if (TREE_COUNT(d
, CLEAN
) < LOW_FRAG_GC_THRESHOLD
)
775 stopat
= MTDSWAP_LOWFRAG
;
777 stopat
= MTDSWAP_HIFRAG
;
779 for (idx
= MTDSWAP_BITFLIP
; idx
>= stopat
; idx
--)
780 if (d
->trees
[idx
].root
.rb_node
!= NULL
)
786 static int mtdswap_wlfreq(unsigned int maxdiff
)
788 unsigned int h
, x
, y
, dist
, base
;
791 * Calculate linear ramp down from f1 to f2 when maxdiff goes from
792 * MAX_ERASE_DIFF to MAX_ERASE_DIFF + COLLECT_NONDIRTY_BASE. Similar
793 * to triangle with height f1 - f2 and width COLLECT_NONDIRTY_BASE.
796 dist
= maxdiff
- MAX_ERASE_DIFF
;
797 if (dist
> COLLECT_NONDIRTY_BASE
)
798 dist
= COLLECT_NONDIRTY_BASE
;
801 * Modelling the slope as a right-angled triangle with base
802 * COLLECT_NONDIRTY_BASE and height freq1 - freq2. The ratio y/x is
803 * equal to the ratio h/base.
805 h
= COLLECT_NONDIRTY_FREQ1
- COLLECT_NONDIRTY_FREQ2
;
806 base
= COLLECT_NONDIRTY_BASE
;
809 y
= (x
* h
+ base
/ 2) / base
;
811 return COLLECT_NONDIRTY_FREQ2
+ y
;
814 static int mtdswap_choose_wl_tree(struct mtdswap_dev
*d
)
816 static unsigned int pick_cnt
;
817 unsigned int i
, idx
= -1, wear
, max
;
818 struct rb_root
*root
;
821 for (i
= 0; i
<= MTDSWAP_DIRTY
; i
++) {
822 root
= &d
->trees
[i
].root
;
823 if (root
->rb_node
== NULL
)
826 wear
= d
->max_erase_count
- MTDSWAP_ECNT_MIN(root
);
833 if (max
> MAX_ERASE_DIFF
&& pick_cnt
>= mtdswap_wlfreq(max
) - 1) {
842 static int mtdswap_choose_gc_tree(struct mtdswap_dev
*d
,
843 unsigned int background
)
847 if (TREE_NONEMPTY(d
, FAILING
) &&
848 (background
|| (TREE_EMPTY(d
, CLEAN
) && TREE_EMPTY(d
, DIRTY
))))
849 return MTDSWAP_FAILING
;
851 idx
= mtdswap_choose_wl_tree(d
);
852 if (idx
>= MTDSWAP_CLEAN
)
855 return __mtdswap_choose_gc_tree(d
);
858 static struct swap_eb
*mtdswap_pick_gc_eblk(struct mtdswap_dev
*d
,
859 unsigned int background
)
861 struct rb_root
*rp
= NULL
;
862 struct swap_eb
*eb
= NULL
;
865 if (background
&& TREE_COUNT(d
, CLEAN
) > CLEAN_BLOCK_THRESHOLD
&&
866 TREE_EMPTY(d
, DIRTY
) && TREE_EMPTY(d
, FAILING
))
869 idx
= mtdswap_choose_gc_tree(d
, background
);
873 rp
= &d
->trees
[idx
].root
;
874 eb
= rb_entry(rb_first(rp
), struct swap_eb
, rb
);
876 rb_erase(&eb
->rb
, rp
);
878 d
->trees
[idx
].count
--;
/*
 * Test pattern for page index @i: alternating bit patterns, 0x55555555
 * for odd indices and 0xAAAAAAAA for even ones, so that neighbouring
 * pages are filled with complementary bits.
 */
static unsigned int mtdswap_test_patt(unsigned int i)
{
	return i % 2 ? 0x55555555 : 0xAAAAAAAA;
}
887 static unsigned int mtdswap_eblk_passes(struct mtdswap_dev
*d
,
890 struct mtd_info
*mtd
= d
->mtd
;
891 unsigned int test
, i
, j
, patt
, mtd_pages
;
893 unsigned int *p1
= (unsigned int *)d
->page_buf
;
894 unsigned char *p2
= (unsigned char *)d
->oob_buf
;
895 struct mtd_oob_ops ops
;
898 ops
.mode
= MTD_OPS_AUTO_OOB
;
899 ops
.len
= mtd
->writesize
;
900 ops
.ooblen
= mtd
->oobavail
;
902 ops
.datbuf
= d
->page_buf
;
903 ops
.oobbuf
= d
->oob_buf
;
904 base
= mtdswap_eb_offset(d
, eb
);
905 mtd_pages
= d
->pages_per_eblk
* PAGE_SIZE
/ mtd
->writesize
;
907 for (test
= 0; test
< 2; test
++) {
909 for (i
= 0; i
< mtd_pages
; i
++) {
910 patt
= mtdswap_test_patt(test
+ i
);
911 memset(d
->page_buf
, patt
, mtd
->writesize
);
912 memset(d
->oob_buf
, patt
, mtd
->oobavail
);
913 ret
= mtd_write_oob(mtd
, pos
, &ops
);
917 pos
+= mtd
->writesize
;
921 for (i
= 0; i
< mtd_pages
; i
++) {
922 ret
= mtd_read_oob(mtd
, pos
, &ops
);
926 patt
= mtdswap_test_patt(test
+ i
);
927 for (j
= 0; j
< mtd
->writesize
/sizeof(int); j
++)
931 for (j
= 0; j
< mtd
->oobavail
; j
++)
932 if (p2
[j
] != (unsigned char)patt
)
935 pos
+= mtd
->writesize
;
938 ret
= mtdswap_erase_block(d
, eb
);
943 eb
->flags
&= ~EBLOCK_READERR
;
947 mtdswap_handle_badblock(d
, eb
);
951 static int mtdswap_gc(struct mtdswap_dev
*d
, unsigned int background
)
956 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
959 eb
= mtdswap_pick_gc_eblk(d
, background
);
963 ret
= mtdswap_gc_eblock(d
, eb
);
967 if (eb
->flags
& EBLOCK_FAILED
) {
968 mtdswap_handle_badblock(d
, eb
);
972 eb
->flags
&= ~EBLOCK_BITFLIP
;
973 ret
= mtdswap_erase_block(d
, eb
);
974 if ((eb
->flags
& EBLOCK_READERR
) &&
975 (ret
|| !mtdswap_eblk_passes(d
, eb
)))
979 ret
= mtdswap_write_marker(d
, eb
, MTDSWAP_TYPE_CLEAN
);
982 mtdswap_rb_add(d
, eb
, MTDSWAP_CLEAN
);
983 else if (ret
!= -EIO
&& !mtd_is_eccerr(ret
))
984 mtdswap_rb_add(d
, eb
, MTDSWAP_DIRTY
);
989 static void mtdswap_background(struct mtd_blktrans_dev
*dev
)
991 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
995 ret
= mtdswap_gc(d
, 1);
996 if (ret
|| mtd_blktrans_cease_background(dev
))
1001 static void mtdswap_cleanup(struct mtdswap_dev
*d
)
1005 vfree(d
->page_data
);
1010 static int mtdswap_flush(struct mtd_blktrans_dev
*dev
)
1012 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1018 static unsigned int mtdswap_badblocks(struct mtd_info
*mtd
, uint64_t size
)
1021 unsigned int badcnt
;
1025 if (mtd_can_have_bb(mtd
))
1026 for (offset
= 0; offset
< size
; offset
+= mtd
->erasesize
)
1027 if (mtd_block_isbad(mtd
, offset
))
1033 static int mtdswap_writesect(struct mtd_blktrans_dev
*dev
,
1034 unsigned long page
, char *buf
)
1036 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1037 unsigned int newblock
, mapped
;
1041 d
->sect_write_count
++;
1043 if (d
->spare_eblks
< MIN_SPARE_EBLOCKS
)
1047 /* Ignore writes to the header page */
1048 if (unlikely(page
== 0))
1054 mapped
= d
->page_data
[page
];
1055 if (mapped
<= BLOCK_MAX
) {
1056 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1058 mtdswap_store_eb(d
, eb
);
1059 d
->page_data
[page
] = BLOCK_UNDEF
;
1060 d
->revmap
[mapped
] = PAGE_UNDEF
;
1063 ret
= mtdswap_write_block(d
, buf
, page
, &newblock
, 0);
1064 d
->mtd_write_count
++;
1069 eb
= d
->eb_data
+ (newblock
/ d
->pages_per_eblk
);
1070 d
->page_data
[page
] = newblock
;
1075 /* Provide a dummy swap header for the kernel */
1076 static int mtdswap_auto_header(struct mtdswap_dev
*d
, char *buf
)
1078 union swap_header
*hd
= (union swap_header
*)(buf
);
1080 memset(buf
, 0, PAGE_SIZE
- 10);
1082 hd
->info
.version
= 1;
1083 hd
->info
.last_page
= d
->mbd_dev
->size
- 1;
1084 hd
->info
.nr_badpages
= 0;
1086 memcpy(buf
+ PAGE_SIZE
- 10, "SWAPSPACE2", 10);
1091 static int mtdswap_readsect(struct mtd_blktrans_dev
*dev
,
1092 unsigned long page
, char *buf
)
1094 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1095 struct mtd_info
*mtd
= d
->mtd
;
1096 unsigned int realblock
, retries
;
1102 d
->sect_read_count
++;
1105 if (unlikely(page
== 0))
1106 return mtdswap_auto_header(d
, buf
);
1111 realblock
= d
->page_data
[page
];
1112 if (realblock
> BLOCK_MAX
) {
1113 memset(buf
, 0x0, PAGE_SIZE
);
1114 if (realblock
== BLOCK_UNDEF
)
1120 eb
= d
->eb_data
+ (realblock
/ d
->pages_per_eblk
);
1121 BUG_ON(d
->revmap
[realblock
] == PAGE_UNDEF
);
1123 readpos
= (loff_t
)realblock
<< PAGE_SHIFT
;
1127 ret
= mtd_read(mtd
, readpos
, PAGE_SIZE
, &retlen
, buf
);
1129 d
->mtd_read_count
++;
1130 if (mtd_is_bitflip(ret
)) {
1131 eb
->flags
|= EBLOCK_BITFLIP
;
1132 mtdswap_rb_add(d
, eb
, MTDSWAP_BITFLIP
);
1137 dev_err(d
->dev
, "Read error %d\n", ret
);
1138 eb
->flags
|= EBLOCK_READERR
;
1139 mtdswap_rb_add(d
, eb
, MTDSWAP_FAILING
);
1141 if (retries
< MTDSWAP_IO_RETRIES
)
1147 if (retlen
!= PAGE_SIZE
) {
1148 dev_err(d
->dev
, "Short read %zd\n", retlen
);
1155 static int mtdswap_discard(struct mtd_blktrans_dev
*dev
, unsigned long first
,
1158 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1161 unsigned int mapped
;
1165 for (page
= first
; page
< first
+ nr_pages
; page
++) {
1166 mapped
= d
->page_data
[page
];
1167 if (mapped
<= BLOCK_MAX
) {
1168 eb
= d
->eb_data
+ (mapped
/ d
->pages_per_eblk
);
1170 mtdswap_store_eb(d
, eb
);
1171 d
->page_data
[page
] = BLOCK_UNDEF
;
1172 d
->revmap
[mapped
] = PAGE_UNDEF
;
1173 d
->discard_page_count
++;
1174 } else if (mapped
== BLOCK_ERROR
) {
1175 d
->page_data
[page
] = BLOCK_UNDEF
;
1176 d
->discard_page_count
++;
1183 static int mtdswap_show(struct seq_file
*s
, void *data
)
1185 struct mtdswap_dev
*d
= (struct mtdswap_dev
*) s
->private;
1187 unsigned int count
[MTDSWAP_TREE_CNT
];
1188 unsigned int min
[MTDSWAP_TREE_CNT
];
1189 unsigned int max
[MTDSWAP_TREE_CNT
];
1190 unsigned int i
, cw
= 0, cwp
= 0, cwecount
= 0, bb_cnt
, mapped
, pages
;
1192 static const char * const name
[] = {
1193 "clean", "used", "low", "high", "dirty", "bitflip", "failing"
1196 mutex_lock(&d
->mbd_dev
->lock
);
1198 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1199 struct rb_root
*root
= &d
->trees
[i
].root
;
1201 if (root
->rb_node
) {
1202 count
[i
] = d
->trees
[i
].count
;
1203 min
[i
] = MTDSWAP_ECNT_MIN(root
);
1204 max
[i
] = MTDSWAP_ECNT_MAX(root
);
1209 if (d
->curr_write
) {
1211 cwp
= d
->curr_write_pos
;
1212 cwecount
= d
->curr_write
->erase_count
;
1216 for (i
= 0; i
< d
->eblks
; i
++)
1217 sum
+= d
->eb_data
[i
].erase_count
;
1219 use_size
= (uint64_t)d
->eblks
* d
->mtd
->erasesize
;
1220 bb_cnt
= mtdswap_badblocks(d
->mtd
, use_size
);
1223 pages
= d
->mbd_dev
->size
;
1224 for (i
= 0; i
< pages
; i
++)
1225 if (d
->page_data
[i
] != BLOCK_UNDEF
)
1228 mutex_unlock(&d
->mbd_dev
->lock
);
1230 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++) {
1234 if (min
[i
] != max
[i
])
1235 seq_printf(s
, "%s:\t%5d erase blocks, erased min %d, "
1237 name
[i
], count
[i
], min
[i
], max
[i
]);
1239 seq_printf(s
, "%s:\t%5d erase blocks, all erased %d "
1240 "times\n", name
[i
], count
[i
], min
[i
]);
1244 seq_printf(s
, "bad:\t%5u erase blocks\n", bb_cnt
);
1247 seq_printf(s
, "current erase block: %u pages used, %u free, "
1248 "erased %u times\n",
1249 cwp
, d
->pages_per_eblk
- cwp
, cwecount
);
1251 seq_printf(s
, "total erasures: %lu\n", sum
);
1255 seq_printf(s
, "mtdswap_readsect count: %llu\n", d
->sect_read_count
);
1256 seq_printf(s
, "mtdswap_writesect count: %llu\n", d
->sect_write_count
);
1257 seq_printf(s
, "mtdswap_discard count: %llu\n", d
->discard_count
);
1258 seq_printf(s
, "mtd read count: %llu\n", d
->mtd_read_count
);
1259 seq_printf(s
, "mtd write count: %llu\n", d
->mtd_write_count
);
1260 seq_printf(s
, "discarded pages count: %llu\n", d
->discard_page_count
);
1263 seq_printf(s
, "total pages: %u\n", pages
);
1264 seq_printf(s
, "pages mapped: %u\n", mapped
);
1268 DEFINE_SHOW_ATTRIBUTE(mtdswap
);
1270 static int mtdswap_add_debugfs(struct mtdswap_dev
*d
)
1272 struct dentry
*root
= d
->mtd
->dbg
.dfs_dir
;
1273 struct dentry
*dent
;
1275 if (!IS_ENABLED(CONFIG_DEBUG_FS
))
1278 if (IS_ERR_OR_NULL(root
))
1281 dent
= debugfs_create_file("mtdswap_stats", S_IRUSR
, root
, d
,
1284 dev_err(d
->dev
, "debugfs_create_file failed\n");
1291 static int mtdswap_init(struct mtdswap_dev
*d
, unsigned int eblocks
,
1292 unsigned int spare_cnt
)
1294 struct mtd_info
*mtd
= d
->mbd_dev
->mtd
;
1295 unsigned int i
, eblk_bytes
, pages
, blocks
;
1300 d
->spare_eblks
= spare_cnt
;
1301 d
->pages_per_eblk
= mtd
->erasesize
>> PAGE_SHIFT
;
1303 pages
= d
->mbd_dev
->size
;
1304 blocks
= eblocks
* d
->pages_per_eblk
;
1306 for (i
= 0; i
< MTDSWAP_TREE_CNT
; i
++)
1307 d
->trees
[i
].root
= RB_ROOT
;
1309 d
->page_data
= vmalloc(array_size(pages
, sizeof(int)));
1311 goto page_data_fail
;
1313 d
->revmap
= vmalloc(array_size(blocks
, sizeof(int)));
1317 eblk_bytes
= sizeof(struct swap_eb
)*d
->eblks
;
1318 d
->eb_data
= vzalloc(eblk_bytes
);
1322 for (i
= 0; i
< pages
; i
++)
1323 d
->page_data
[i
] = BLOCK_UNDEF
;
1325 for (i
= 0; i
< blocks
; i
++)
1326 d
->revmap
[i
] = PAGE_UNDEF
;
1328 d
->page_buf
= kmalloc(PAGE_SIZE
, GFP_KERNEL
);
1332 d
->oob_buf
= kmalloc_array(2, mtd
->oobavail
, GFP_KERNEL
);
1336 mtdswap_scan_eblks(d
);
1347 vfree(d
->page_data
);
1349 printk(KERN_ERR
"%s: init failed (%d)\n", MTDSWAP_PREFIX
, ret
);
1353 static void mtdswap_add_mtd(struct mtd_blktrans_ops
*tr
, struct mtd_info
*mtd
)
1355 struct mtdswap_dev
*d
;
1356 struct mtd_blktrans_dev
*mbd_dev
;
1360 unsigned int eblocks
, eavailable
, bad_blocks
, spare_cnt
;
1361 uint64_t swap_size
, use_size
, size_limit
;
1364 parts
= &partitions
[0];
1368 while ((this_opt
= strsep(&parts
, ",")) != NULL
) {
1369 if (kstrtoul(this_opt
, 0, &part
) < 0)
1372 if (mtd
->index
== part
)
1376 if (mtd
->index
!= part
)
1379 if (mtd
->erasesize
< PAGE_SIZE
|| mtd
->erasesize
% PAGE_SIZE
) {
1380 printk(KERN_ERR
"%s: Erase size %u not multiple of PAGE_SIZE "
1381 "%lu\n", MTDSWAP_PREFIX
, mtd
->erasesize
, PAGE_SIZE
);
1385 if (PAGE_SIZE
% mtd
->writesize
|| mtd
->writesize
> PAGE_SIZE
) {
1386 printk(KERN_ERR
"%s: PAGE_SIZE %lu not multiple of write size"
1387 " %u\n", MTDSWAP_PREFIX
, PAGE_SIZE
, mtd
->writesize
);
1391 if (!mtd
->oobsize
|| mtd
->oobavail
< MTDSWAP_OOBSIZE
) {
1392 printk(KERN_ERR
"%s: Not enough free bytes in OOB, "
1393 "%d available, %zu needed.\n",
1394 MTDSWAP_PREFIX
, mtd
->oobavail
, MTDSWAP_OOBSIZE
);
1398 if (spare_eblocks
> 100)
1399 spare_eblocks
= 100;
1401 use_size
= mtd
->size
;
1402 size_limit
= (uint64_t) BLOCK_MAX
* PAGE_SIZE
;
1404 if (mtd
->size
> size_limit
) {
1405 printk(KERN_WARNING
"%s: Device too large. Limiting size to "
1406 "%llu bytes\n", MTDSWAP_PREFIX
, size_limit
);
1407 use_size
= size_limit
;
1410 eblocks
= mtd_div_by_eb(use_size
, mtd
);
1411 use_size
= (uint64_t)eblocks
* mtd
->erasesize
;
1412 bad_blocks
= mtdswap_badblocks(mtd
, use_size
);
1413 eavailable
= eblocks
- bad_blocks
;
1415 if (eavailable
< MIN_ERASE_BLOCKS
) {
1416 printk(KERN_ERR
"%s: Not enough erase blocks. %u available, "
1417 "%d needed\n", MTDSWAP_PREFIX
, eavailable
,
1422 spare_cnt
= div_u64((uint64_t)eavailable
* spare_eblocks
, 100);
1424 if (spare_cnt
< MIN_SPARE_EBLOCKS
)
1425 spare_cnt
= MIN_SPARE_EBLOCKS
;
1427 if (spare_cnt
> eavailable
- 1)
1428 spare_cnt
= eavailable
- 1;
1430 swap_size
= (uint64_t)(eavailable
- spare_cnt
) * mtd
->erasesize
+
1431 (header
? PAGE_SIZE
: 0);
1433 printk(KERN_INFO
"%s: Enabling MTD swap on device %lu, size %llu KB, "
1434 "%u spare, %u bad blocks\n",
1435 MTDSWAP_PREFIX
, part
, swap_size
/ 1024, spare_cnt
, bad_blocks
);
1437 d
= kzalloc(sizeof(struct mtdswap_dev
), GFP_KERNEL
);
1441 mbd_dev
= kzalloc(sizeof(struct mtd_blktrans_dev
), GFP_KERNEL
);
1447 d
->mbd_dev
= mbd_dev
;
1451 mbd_dev
->devnum
= mtd
->index
;
1452 mbd_dev
->size
= swap_size
>> PAGE_SHIFT
;
1455 if (!(mtd
->flags
& MTD_WRITEABLE
))
1456 mbd_dev
->readonly
= 1;
1458 if (mtdswap_init(d
, eblocks
, spare_cnt
) < 0)
1461 if (add_mtd_blktrans_dev(mbd_dev
) < 0)
1464 d
->dev
= disk_to_dev(mbd_dev
->disk
);
1466 ret
= mtdswap_add_debugfs(d
);
1468 goto debugfs_failed
;
1473 del_mtd_blktrans_dev(mbd_dev
);
1483 static void mtdswap_remove_dev(struct mtd_blktrans_dev
*dev
)
1485 struct mtdswap_dev
*d
= MTDSWAP_MBD_TO_MTDSWAP(dev
);
1487 del_mtd_blktrans_dev(dev
);
1492 static struct mtd_blktrans_ops mtdswap_ops
= {
1496 .blksize
= PAGE_SIZE
,
1497 .flush
= mtdswap_flush
,
1498 .readsect
= mtdswap_readsect
,
1499 .writesect
= mtdswap_writesect
,
1500 .discard
= mtdswap_discard
,
1501 .background
= mtdswap_background
,
1502 .add_mtd
= mtdswap_add_mtd
,
1503 .remove_dev
= mtdswap_remove_dev
,
1504 .owner
= THIS_MODULE
,
1507 static int __init
mtdswap_modinit(void)
1509 return register_mtd_blktrans(&mtdswap_ops
);
1512 static void __exit
mtdswap_modexit(void)
1514 deregister_mtd_blktrans(&mtdswap_ops
);
1517 module_init(mtdswap_modinit
);
1518 module_exit(mtdswap_modexit
);
1521 MODULE_LICENSE("GPL");
1522 MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
1523 MODULE_DESCRIPTION("Block device access to an MTD suitable for using as "