2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.5 2008/01/21 00:00:19 dillon Exp $
39 static int hammer_recover_buffer_stage2(hammer_cluster_t cluster
,
41 static int hammer_recover_record(hammer_cluster_t cluster
,
42 hammer_buffer_t buffer
, int32_t rec_offset
,
43 hammer_record_ondisk_t rec
);
44 static int hammer_recover_btree(hammer_cluster_t cluster
,
45 hammer_buffer_t buffer
, int32_t rec_offset
,
46 hammer_record_ondisk_t rec
);
49 * Recover a cluster. The caller has referenced and locked the cluster.
51 * Generally returns 0 on success and EIO if the recovery was unsuccessful.
/*
 * NOTE(review): this file is a lossy, line-mangled extraction of
 * DragonFly's sys/vfs/hammer/hammer_recover.c (rev 1.5).  Statements are
 * split mid-expression and many original lines (declarations, braces,
 * returns, comment delimiters) are missing, so the code text below is
 * kept byte-identical and only review comments are added.
 *
 * hammer_recover() - recover a cluster; the caller has referenced and
 * locked it.  Visible stages: re-initialize the master/btree/mdata
 * A-lists and recover the record A-list; reserve the record buffers
 * named in clu_record_buf_bitmap out of the master A-list; stage-1
 * recover each record buffer; build an empty B-Tree leaf root; stage-2
 * rescan the bitmap to regenerate the B-Tree from surviving records.
 * Presumably returns 0 on success and EIO on failure (per the header
 * comment of this extraction) -- the return paths are not visible here.
 */
54 hammer_recover(hammer_cluster_t cluster
)
64 kprintf("HAMMER_RECOVER %d:%d\n",
65 cluster
->volume
->vol_no
, cluster
->clu_no
);
/* recovery requires a valid last-synchronized record id in the header */
66 KKASSERT(cluster
->ondisk
->synchronized_rec_id
);
/* number of HAMMER_BUFSIZE buffers spanned by this cluster */
68 nbuffers
= cluster
->ondisk
->clu_limit
/ HAMMER_BUFSIZE
;
69 hammer_modify_cluster(cluster
);
72 * Re-initialize the master, B-Tree, and mdata A-lists, and
73 * recover the record A-list.
75 hammer_alist_init(&cluster
->alist_master
, 1, nbuffers
- 1,
77 hammer_alist_init(&cluster
->alist_btree
,
79 (nbuffers
- 1) * HAMMER_FSBUF_MAXBLKS
,
81 hammer_alist_init(&cluster
->alist_mdata
,
83 (nbuffers
- 1) * HAMMER_FSBUF_MAXBLKS
,
85 hammer_alist_recover(&cluster
->alist_record
,
88 (nbuffers
- 1) * HAMMER_FSBUF_MAXBLKS
);
92 * Scan the cluster's clu_record_buf_bitmap, reserve record buffers
93 * from the master bitmap.
95 for (buf_no
= 1; buf_no
< nbuffers
; ++buf_no
) {
96 bitmap
= cluster
->ondisk
->clu_record_buf_bitmap
[buf_no
>> 5];
/*
 * NOTE(review): the jump to the last bit of the current 32-bit bitmap
 * word below is presumably guarded by a missing "if (bitmap == 0)"
 * (skip empty words) -- confirm against the upstream source.  The same
 * word-skip pattern repeats in the two later scan loops.
 */
98 buf_no
= ((buf_no
+ 32) & ~31) - 1;
101 if ((bitmap
& (1 << (buf_no
& 31))) == 0)
103 r
= hammer_alist_alloc_fwd(&cluster
->alist_master
, 1, buf_no
);
104 KKASSERT(r
== buf_no
);
108 * Scan the cluster's clu_record_buf_bitmap, reassign buffers
109 * from alist_master to alist_record, and reallocate individual
110 * records and any related data reference if they meet the criteria.
112 for (buf_no
= 1; buf_no
< nbuffers
; ++buf_no
) {
113 bitmap
= cluster
->ondisk
->clu_record_buf_bitmap
[buf_no
>> 5];
115 buf_no
= ((buf_no
+ 32) & ~31) - 1;
118 if ((bitmap
& (1 << (buf_no
& 31))) == 0)
120 hammer_recover_buffer_stage1(cluster
, buf_no
);
125 * The cluster is now in good enough shape that general allocations
126 * are possible. Construct an empty B-Tree root.
132 croot
= hammer_alloc_btree(cluster
, &error
);
134 hammer_modify_node(croot
);
135 bzero(croot
->ondisk
, sizeof(*croot
->ondisk
));
136 croot
->ondisk
->count
= 0;
137 croot
->ondisk
->type
= HAMMER_BTREE_TYPE_LEAF
;
138 cluster
->ondisk
->clu_btree_root
= croot
->node_offset
;
139 hammer_rel_node(croot
);
141 KKASSERT(error
== 0);
145 * Scan the cluster's clu_record_buf_bitmap again and regenerate
151 for (buf_no
= 1; buf_no
< nbuffers
; ++buf_no
) {
152 bitmap
= cluster
->ondisk
->clu_record_buf_bitmap
[buf_no
>> 5];
154 buf_no
= ((buf_no
+ 32) & ~31) - 1;
157 if ((bitmap
& (1 << (buf_no
& 31))) == 0)
160 record_count
+= hammer_recover_buffer_stage2(cluster
, buf_no
);
162 kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
163 cluster
->volume
->vol_no
, cluster
->clu_no
,
164 buffer_count
, record_count
);
167 * Validate the parent cluster pointer. XXX
/*
 * buffer_alist_recover() - reassign buf_no as a record buffer and
 * recover any related data; per the comment fragments below it is used
 * as an A-list callback and must return a negative error code or a
 * positive free block count.  It re-reserves the buffer in the master
 * A-list, recovers the buffer's own record A-list, then walks each
 * allocated record slot via hammer_alist_find() and validates it with
 * hammer_recover_record(); slots that fail validation are freed back.
 *
 * NOTE(review): the extraction appears to have merged this callback
 * with hammer_recover_buffer_stage1() -- the error kprintf below names
 * stage1 -- confirm the true function boundary against upstream.
 */
173 * Reassign buf_no as a record buffer and recover any related data
176 * This is used in the alist callback and must return a negative error
177 * code or a positive free block count.
180 buffer_alist_recover(void *info
, int32_t blk
, int32_t radix
, int32_t count
)
182 hammer_cluster_t cluster
;
183 hammer_record_ondisk_t rec
;
184 hammer_buffer_t buffer
;
192 * Extract cluster and buffer number to recover
195 buf_no
= blk
/ HAMMER_FSBUF_MAXBLKS
;
198 * Mark the buffer as allocated in the cluster's master A-list.
200 r
= hammer_alist_alloc_fwd(&cluster
->alist_master
, 1, buf_no
);
201 KKASSERT(r
== buf_no
);
203 kprintf("recover buffer1 %d:%d:%d\n",
204 cluster
->volume
->vol_no
,
205 cluster
->clu_no
, buf_no
);
206 buffer
= hammer_get_buffer(cluster
, buf_no
, 0, &error
);
209 * If we are unable to access the buffer leave it in a
210 * reserved state on the master alist.
212 kprintf("hammer_recover_buffer_stage1: error "
213 "recovering %d:%d:%d\n",
214 cluster
->volume
->vol_no
, cluster
->clu_no
, buf_no
);
219 * Recover the buffer, scan and validate allocated records. Records
220 * which cannot be recovered are freed.
222 * The parent a-list must be properly adjusted so don't just call
223 * hammer_alist_recover() on the underlying buffer. Go through the
226 hammer_modify_buffer(buffer
);
227 count
= hammer_alist_recover(&buffer
->alist
, 0, 0, HAMMER_RECORD_NODES
);
228 kprintf("hammer_recover_buffer count1 %d/%d\n", count
, HAMMER_RECORD_NODES
);
/*
 * NOTE(review): rec_no's initialization (presumably -1, so the +1 makes
 * the first hammer_alist_find() start at slot 0) and the enclosing loop
 * header are missing from this extraction -- confirm upstream.
 */
231 rec_no
= hammer_alist_find(&buffer
->alist
, rec_no
+ 1,
232 HAMMER_RECORD_NODES
);
233 if (rec_no
== HAMMER_ALIST_BLOCK_NONE
)
236 kprintf("recover record %d:%d:%d %d\n",
237 cluster
->volume
->vol_no
,
238 cluster
->clu_no
, buf_no
, rec_no
);
240 rec_offset
= offsetof(union hammer_fsbuf_ondisk
,
241 record
.recs
[rec_no
]);
242 rec_offset
+= buf_no
* HAMMER_BUFSIZE
;
243 rec
= &buffer
->ondisk
->record
.recs
[rec_no
];
244 error
= hammer_recover_record(cluster
, buffer
, rec_offset
, rec
);
246 kprintf("hammer_recover_record: failed %d:%d@%d\n",
247 cluster
->clu_no
, buffer
->buf_no
, rec_offset
);
248 hammer_alist_free(&buffer
->alist
, rec_no
, 1);
249 ++count
; /* free count */
252 kprintf("hammer_recover_buffer count2 %d/%d\n", count
, HAMMER_RECORD_NODES
);
253 hammer_rel_buffer(buffer
, 0);
/*
 * hammer_recover_record() - validate a single on-disk record and
 * re-reserve any data it references, at least into a state that does
 * not blow up the filesystem.  Returns 0 on success, non-zero if the
 * record is unrecoverable (per the comment fragments below).
 *
 * Visible validation steps: discard records whose rec_id is at or past
 * the cluster's synchronized_rec_id; clamp delete_tid back to 0 if it
 * postdates the sync; range-check the B-Tree key against
 * clu_btree_beg/clu_btree_end (non-spike records only); sanity-check
 * data_offset/data_len against the cluster limit and against overlap
 * with the record itself; finally re-allocate the data region either as
 * whole buffers from alist_master (buffer-aligned data) or as
 * HAMMER_DATA_BLKSIZE sub-blocks from alist_mdata.
 */
258 * Recover a record, at least into a state that doesn't blow up the
259 * filesystem. Returns 0 on success, non-zero if the record is
263 hammer_recover_record(hammer_cluster_t cluster
, hammer_buffer_t buffer
,
264 int32_t rec_offset
, hammer_record_ondisk_t rec
)
266 hammer_buffer_t dbuf
;
267 u_int64_t syncid
= cluster
->ondisk
->synchronized_rec_id
;
278 * We have to discard any records with rec_id's greater then the
279 * last sync of the cluster header (which guarenteed all related
280 * buffers had been synced). Otherwise the record may reference
281 * information that was never synced to disk.
283 if (rec
->base
.rec_id
>= syncid
) {
284 kprintf("recover record: syncid too large %016llx/%016llx\n",
285 rec
->base
.rec_id
, syncid
);
290 /* XXX undo incomplete deletions */
291 if (rec
->base
.base
.delete_tid
> syncid
)
292 rec
->base
.base
.delete_tid
= 0;
296 * Validate the record's B-Tree key
298 if (rec
->base
.base
.rec_type
!= HAMMER_RECTYPE_CLUSTER
) {
299 if (hammer_btree_cmp(&rec
->base
.base
,
300 &cluster
->ondisk
->clu_btree_beg
) < 0) {
301 kprintf("recover record: range low\n");
304 if (hammer_btree_cmp(&rec
->base
.base
,
305 &cluster
->ondisk
->clu_btree_end
) >= 0) {
306 kprintf("recover record: range high\n");
312 * Validate the record's data. If the offset is 0 there is no data
313 * (or it is zero-fill) and we can return success immediately.
314 * Otherwise make sure everything is ok.
316 data_offset
= rec
->base
.data_offset
;
317 data_len
= rec
->base
.data_len
;
/*
 * NOTE(review): the guard for this zeroing (presumably "if (data_len
 * == 0)") is missing from this extraction -- confirm upstream.
 */
320 rec
->base
.data_offset
= data_offset
= 0;
321 if (data_offset
== 0)
325 * Non-zero data offset, recover the data
327 if (data_offset
< HAMMER_BUFSIZE
||
328 data_offset
>= cluster
->ondisk
->clu_limit
||
329 data_len
< 0 || data_len
> HAMMER_MAXDATA
||
330 data_offset
+ data_len
> cluster
->ondisk
->clu_limit
) {
331 kprintf("recover record: bad offset/len %d/%d\n",
332 data_offset
, data_len
);
337 * Check data_offset relative to rec_offset
339 if (data_offset
< rec_offset
&& data_offset
+ data_len
> rec_offset
) {
340 kprintf("recover record: bad offset: overlapping1\n");
343 if (data_offset
>= rec_offset
&&
344 data_offset
< rec_offset
+ sizeof(struct hammer_base_record
)) {
345 kprintf("recover record: bad offset: overlapping2\n");
350 * Check for data embedded in the record
352 if (data_offset
>= rec_offset
&&
353 data_offset
< rec_offset
+ HAMMER_RECORD_SIZE
) {
354 if (data_offset
+ data_len
> rec_offset
+ HAMMER_RECORD_SIZE
) {
355 kprintf("recover record: bad offset: overlapping3\n");
362 * Recover the allocated data either out of the cluster's master alist
363 * or as a buffer sub-allocation.
365 if ((data_len
& HAMMER_BUFMASK
) == 0) {
366 if (data_offset
& HAMMER_BUFMASK
) {
367 kprintf("recover record: bad offset: unaligned\n");
370 nblks
= data_len
/ HAMMER_BUFSIZE
;
371 dbuf_no
= data_offset
/ HAMMER_BUFSIZE
;
373 r
= hammer_alist_alloc_fwd(&cluster
->alist_master
,
375 if (r
== HAMMER_ALIST_BLOCK_NONE
) {
376 kprintf("recover record: cannot recover offset1\n");
380 kprintf("recover record: cannot recover offset2\n");
381 hammer_alist_free(&cluster
->alist_master
, r
, nblks
);
/* sub-buffer data: must lie within one buffer, past its fsbuf header */
385 if ((data_offset
& ~HAMMER_BUFMASK
) !=
386 ((data_offset
+ data_len
- 1) & ~HAMMER_BUFMASK
)) {
387 kprintf("recover record: overlaps multiple bufs\n");
390 if ((data_offset
& HAMMER_BUFMASK
) <
391 sizeof(struct hammer_fsbuf_head
)) {
392 kprintf("recover record: data in header area\n");
395 if (data_offset
& HAMMER_DATA_BLKMASK
) {
396 kprintf("recover record: data blk unaligned\n");
401 * Ok, recover the space in the data buffer.
403 dbuf_no
= data_offset
/ HAMMER_BUFSIZE
;
404 r
= hammer_alist_alloc_fwd(&cluster
->alist_master
, 1, dbuf_no
);
405 if (r
!= dbuf_no
&& r
!= HAMMER_ALIST_BLOCK_NONE
)
406 hammer_alist_free(&cluster
->alist_master
, r
, 1);
409 * This is the first time we've tried to recover
410 * data in this data buffer, reinit it (but don't
411 * zero it out, obviously).
413 * Calling initbuffer marks the data blocks within
414 * the buffer as being free.
416 dbuf
= hammer_get_buffer(cluster
, dbuf_no
,
419 hammer_modify_buffer(dbuf
);
420 hammer_initbuffer(&dbuf
->alist
,
423 dbuf
->buf_type
= HAMMER_FSBUF_DATA
;
425 base_blk
= dbuf_no
* HAMMER_FSBUF_MAXBLKS
;
426 hammer_alist_free(&cluster
->alist_mdata
, base_blk
,
430 * We've seen this data buffer before.
432 dbuf
= hammer_get_buffer(cluster
, dbuf_no
,
436 kprintf("recover record: data: getbuf failed\n");
440 if (dbuf
->buf_type
!= HAMMER_FSBUF_DATA
) {
441 hammer_rel_buffer(dbuf
, 0);
442 kprintf("recover record: data: wrong buffer type\n");
447 * Figure out the data block number and number of blocks.
449 nblks
= (data_len
+ HAMMER_DATA_BLKMASK
) & ~HAMMER_DATA_BLKMASK
;
450 nblks
/= HAMMER_DATA_BLKSIZE
;
451 dblk_no
= ((data_offset
& HAMMER_BUFMASK
) - offsetof(union hammer_fsbuf_ondisk
, data
.data
)) / HAMMER_DATA_BLKSIZE
;
452 if ((data_offset
& HAMMER_BUFMASK
) != offsetof(union hammer_fsbuf_ondisk
, data
.data
[dblk_no
])) {
453 kprintf("dblk_no %d does not match data_offset %d/%d\n",
455 offsetof(union hammer_fsbuf_ondisk
, data
.data
[dblk_no
]),
456 (data_offset
& HAMMER_BUFMASK
));
457 hammer_rel_buffer(dbuf
, 0);
458 kprintf("recover record: data: not block aligned\n");
459 Debugger("bad data");
/* convert buffer-relative block to cluster-wide mdata a-list block */
462 dblk_no
+= dbuf_no
* HAMMER_FSBUF_MAXBLKS
;
463 r
= hammer_alist_alloc_fwd(&cluster
->alist_mdata
, nblks
, dblk_no
);
465 if (r
!= HAMMER_ALIST_BLOCK_NONE
)
466 hammer_alist_free(&cluster
->alist_mdata
, r
, nblks
);
467 hammer_rel_buffer(dbuf
, 0);
468 kprintf("recover record: data: unable to realloc\n");
471 hammer_rel_buffer(dbuf
, 0);
/*
 * hammer_recover_buffer_stage2() - rebuild the B-Tree for the records
 * residing in buffer buf_no.  Walks the buffer's allocated record slots
 * with hammer_alist_find() and inserts each via hammer_recover_btree();
 * returns the number of records recovered (record_count).  Insertion
 * failures are only logged -- the slot is deliberately not freed (see
 * the commented-out hammer_alist_free and XXX below).
 */
478 * Rebuild the B-Tree for the records residing in the specified buffer.
480 * Return the number of records recovered.
483 hammer_recover_buffer_stage2(hammer_cluster_t cluster
, int32_t buf_no
)
485 hammer_record_ondisk_t rec
;
486 hammer_buffer_t buffer
;
489 int record_count
= 0;
492 buffer
= hammer_get_buffer(cluster
, buf_no
, 0, &error
);
495 * If we are unable to access the buffer leave it in a
496 * reserved state on the master alist.
498 kprintf("hammer_recover_buffer_stage2: error "
499 "recovering %d:%d:%d\n",
500 cluster
->volume
->vol_no
, cluster
->clu_no
, buf_no
);
505 * Recover the buffer, scan and validate allocated records. Records
506 * which cannot be recovered are freed.
/*
 * NOTE(review): the loop header and rec_no initialization for this scan
 * are missing from the extraction -- presumably the same find-next idiom
 * as in buffer_alist_recover() above; confirm upstream.
 */
510 rec_no
= hammer_alist_find(&buffer
->alist
, rec_no
+ 1,
511 HAMMER_RECORD_NODES
);
512 if (rec_no
== HAMMER_ALIST_BLOCK_NONE
)
514 rec_offset
= offsetof(union hammer_fsbuf_ondisk
,
515 record
.recs
[rec_no
]);
516 rec_offset
+= buf_no
* HAMMER_BUFSIZE
;
517 rec
= &buffer
->ondisk
->record
.recs
[rec_no
];
518 error
= hammer_recover_btree(cluster
, buffer
, rec_offset
, rec
);
520 kprintf("hammer_recover_btree: failed %d:%d@%d\n",
521 cluster
->clu_no
, buffer
->buf_no
, rec_offset
);
522 /* XXX free the record and its data? */
523 /*hammer_alist_free(&buffer->alist, rec_no, 1);*/
528 hammer_rel_buffer(buffer
, 0);
529 return(record_count
);
/*
 * hammer_recover_btree() - enter a single record into the cluster's
 * B-Tree.  Spike records (HAMMER_RECTYPE_CLUSTER) hook a child cluster
 * back in: the child is fetched with GET_CLUSTER_NORECOVER (no
 * recursive recovery), its parent vol_no/clu_no back-pointers are
 * validated, and hammer_btree_insert_cluster() is used with the child's
 * clu_btree_beg bounds as the key.  Normal records build a leaf element
 * from the record's base key / rec_offset / data fields and use
 * hammer_btree_insert().  The cursor is cluster-localized so its parent
 * starts out NULL; EDEADLK is asserted impossible in recovery mode.
 *
 * NOTE(review): the function's tail (error return and closing brace) is
 * not visible in this extraction; only review comments added here.
 */
533 * Enter a single record into the B-Tree.
536 hammer_recover_btree(hammer_cluster_t cluster
, hammer_buffer_t buffer
,
537 int32_t rec_offset
, hammer_record_ondisk_t rec
)
539 struct hammer_cursor cursor
;
540 union hammer_btree_elm elm
;
541 hammer_cluster_t ncluster
;
545 * Check for a spike record. When spiking into a new cluster do
546 * NOT allow a recursive recovery to occur. We use a lot of
547 * stack and the only thing we actually modify in the target
548 * cluster is its parent pointer.
550 if (rec
->base
.base
.rec_type
== HAMMER_RECTYPE_CLUSTER
) {
551 hammer_volume_t ovolume
= cluster
->volume
;
552 hammer_volume_t nvolume
;
554 nvolume
= hammer_get_volume(ovolume
->hmp
, rec
->spike
.vol_no
,
558 ncluster
= hammer_get_cluster(nvolume
, rec
->spike
.clu_no
,
559 &error
, GET_CLUSTER_NORECOVER
);
560 hammer_rel_volume(nvolume
, 0);
565 * Validate the cluster. Allow the offset to be fixed up.
567 if (ncluster
->ondisk
->clu_btree_parent_vol_no
!= ovolume
->vol_no
||
568 ncluster
->ondisk
->clu_btree_parent_clu_no
!= cluster
->clu_no
) {
569 kprintf("hammer_recover: Bad cluster spike hookup: "
571 ncluster
->ondisk
->clu_btree_parent_vol_no
,
572 ncluster
->ondisk
->clu_btree_parent_clu_no
,
576 hammer_rel_cluster(ncluster
, 0);
584 * Locate the insertion point. Note that we are using the cluster-
585 * localized cursor init so parent will start out NULL.
587 * The key(s) used for spike's are bounds and different from the
588 * key embedded in the spike record. A special B-Tree insertion
589 * call is made to deal with spikes.
591 error
= hammer_init_cursor_cluster(&cursor
, cluster
);
594 KKASSERT(cursor
.node
);
596 cursor
.key_beg
= ncluster
->ondisk
->clu_btree_beg
;
598 cursor
.key_beg
= rec
->base
.base
;
599 cursor
.flags
|= HAMMER_CURSOR_INSERT
| HAMMER_CURSOR_RECOVER
;
601 error
= hammer_btree_lookup(&cursor
);
602 KKASSERT(error
!= EDEADLK
);
603 KKASSERT(cursor
.node
);
605 kprintf("hammer_recover_btree: Duplicate record cursor %p rec %p ncluster %p\n",
606 &cursor
, rec
, ncluster
);
607 hammer_print_btree_elm(&cursor
.node
->ondisk
->elms
[cursor
.index
], HAMMER_BTREE_TYPE_LEAF
, cursor
.index
);
608 Debugger("duplicate record");
617 kprintf("recover spike clu %d %016llx-%016llx\n",
619 ncluster
->ondisk
->clu_btree_beg
.obj_id
,
620 ncluster
->ondisk
->clu_btree_end
.obj_id
);
621 error
= hammer_btree_insert_cluster(&cursor
, ncluster
,
623 kprintf("recover spike record error %d\n", error
);
624 KKASSERT(error
!= EDEADLK
);
626 Debugger("spike recovery");
632 kprintf("recover recrd clu %d %016llx\n",
633 cluster
->clu_no
, rec
->base
.base
.obj_id
);
/* build the leaf element directly from the on-disk record fields */
635 elm
.leaf
.base
= rec
->base
.base
;
636 elm
.leaf
.rec_offset
= rec_offset
;
637 elm
.leaf
.data_offset
= rec
->base
.data_offset
;
638 elm
.leaf
.data_len
= rec
->base
.data_len
;
639 elm
.leaf
.data_crc
= rec
->base
.data_crc
;
641 error
= hammer_btree_insert(&cursor
, &elm
);
642 KKASSERT(error
!= EDEADLK
);
646 kprintf("hammer_recover_btree: insertion failed\n");
651 hammer_rel_cluster(ncluster
, 0);
652 hammer_done_cursor(&cursor
)