HAMMER 22/many: Recovery and B-Tree work.
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
blob63871983a129f679a19f704c6ac770733bb302ef
1 /*
2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.5 2008/01/21 00:00:19 dillon Exp $
37 #include "hammer.h"
/*
 * Forward declarations for the static recovery helpers defined later in
 * this file: the stage-2 per-buffer B-Tree rebuild scan, per-record
 * validation, and single-record B-Tree insertion.
 */
39 static int hammer_recover_buffer_stage2(hammer_cluster_t cluster,
40 int32_t buf_no);
41 static int hammer_recover_record(hammer_cluster_t cluster,
42 hammer_buffer_t buffer, int32_t rec_offset,
43 hammer_record_ondisk_t rec);
44 static int hammer_recover_btree(hammer_cluster_t cluster,
45 hammer_buffer_t buffer, int32_t rec_offset,
46 hammer_record_ondisk_t rec);
49 * Recover a cluster. The caller has referenced and locked the cluster.
51 * Generally returns 0 on success and EIO if the recovery was unsuccessful.
/*
 * hammer_recover() - rebuild a cluster's allocation A-lists and its
 * B-Tree from the on-disk record buffers.  Per the comment above the
 * function, the caller has referenced and locked the cluster; returns
 * 0 on success, EIO on unsuccessful recovery.
 *
 * NOTE(review): this listing is a numbered web extraction -- each line
 * keeps its original line number, and blank lines / lone-brace lines /
 * comment delimiters were dropped, so block boundaries below are
 * implied rather than visible.  Annotations here are review comments
 * only; the code text is untouched.
 */
53 int
54 hammer_recover(hammer_cluster_t cluster)
56 int buf_no;
57 int nbuffers;
58 int buffer_count;
59 int record_count;
60 u_int32_t bitmap;
/*
 * NOTE(review): this unconditional return(0) (orig line 62) makes the
 * remainder of the function dead code -- it reads like a temporary
 * "recovery disabled" switch left in during development.  Confirm
 * against repository history before relying on anything below.
 */
62 return(0);
64 kprintf("HAMMER_RECOVER %d:%d\n",
65 cluster->volume->vol_no, cluster->clu_no);
66 KKASSERT(cluster->ondisk->synchronized_rec_id);
/* Cluster size in HAMMER_BUFSIZE buffers; buffer 0 is the header. */
68 nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
69 hammer_modify_cluster(cluster);
72 * Re-initialize the master, B-Tree, and mdata A-lists, and
73 * recover the record A-list.
/*
 * NOTE(review): master list starts FREE; btree/mdata start fully
 * ALLOCated and are freed back piecemeal as buffers are recovered.
 * alist_record is rebuilt via its recovery callback (see
 * buffer_alist_recover below), which re-reserves record buffers.
 */
75 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
76 HAMMER_ASTATE_FREE);
77 hammer_alist_init(&cluster->alist_btree,
78 HAMMER_FSBUF_MAXBLKS,
79 (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
80 HAMMER_ASTATE_ALLOC);
81 hammer_alist_init(&cluster->alist_mdata,
82 HAMMER_FSBUF_MAXBLKS,
83 (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
84 HAMMER_ASTATE_ALLOC);
85 hammer_alist_recover(&cluster->alist_record,
87 HAMMER_FSBUF_MAXBLKS,
88 (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS);
/*
 * NOTE(review): the two bitmap scans below (stage-1 reservation and
 * stage-1 per-buffer recovery) are compiled out with #if 0 -- that
 * work is now driven through the alist_record recovery callback above.
 */
90 #if 0
92 * Scan the cluster's clu_record_buf_bitmap, reserve record buffers
93 * from the master bitmap.
95 for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
96 bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
97 if (bitmap == 0) {
/* Skip to the last bit of this 32-bit word; ++buf_no re-advances. */
98 buf_no = ((buf_no + 32) & ~31) - 1;
99 continue;
101 if ((bitmap & (1 << (buf_no & 31))) == 0)
102 continue;
103 r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
104 KKASSERT(r == buf_no);
108 * Scan the cluster's clu_record_buf_bitmap, reassign buffers
109 * from alist_master to alist_record, and reallocate individual
110 * records and any related data reference if they meet the criteria.
112 for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
113 bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
114 if (bitmap == 0) {
115 buf_no = ((buf_no + 32) & ~31) - 1;
116 continue;
118 if ((bitmap & (1 << (buf_no & 31))) == 0)
119 continue;
120 hammer_recover_buffer_stage1(cluster, buf_no);
122 #endif
125 * The cluster is now in good enough shape that general allocations
126 * are possible. Construct an empty B-Tree root.
/* Allocate a fresh empty leaf node and install it as the B-Tree root. */
129 hammer_node_t croot;
130 int error;
132 croot = hammer_alloc_btree(cluster, &error);
133 if (error == 0) {
134 hammer_modify_node(croot);
135 bzero(croot->ondisk, sizeof(*croot->ondisk));
136 croot->ondisk->count = 0;
137 croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
138 cluster->ondisk->clu_btree_root = croot->node_offset;
139 hammer_rel_node(croot);
141 KKASSERT(error == 0);
145 * Scan the cluster's clu_record_buf_bitmap again and regenerate
146 * the B-Tree.
148 buffer_count = 0;
149 record_count = 0;
151 for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
152 bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
153 if (bitmap == 0) {
154 buf_no = ((buf_no + 32) & ~31) - 1;
155 continue;
157 if ((bitmap & (1 << (buf_no & 31))) == 0)
158 continue;
159 ++buffer_count;
160 record_count += hammer_recover_buffer_stage2(cluster, buf_no);
162 kprintf("HAMMER_RECOVER DONE %d:%d buffers=%d records=%d\n",
163 cluster->volume->vol_no, cluster->clu_no,
164 buffer_count, record_count);
167 * Validate the parent cluster pointer. XXX
169 return(0);
173 * Reassign buf_no as a record buffer and recover any related data
174 * references.
176 * This is used in the alist callback and must return a negative error
177 * code or a positive free block count.
/*
 * buffer_alist_recover() - A-list recovery callback (see the comment
 * above: must return a negative error code or a positive free-block
 * count).  Reassigns buf_no as a record buffer in the master A-list,
 * recovers the buffer's internal record A-list through the parent,
 * then validates every allocated record with hammer_recover_record(),
 * freeing records that fail validation.
 *
 * NOTE(review): the storage-class/return-type line (orig ~line 179)
 * was lost in this extraction; presumably "static int32_t" to match
 * the alist callback signature -- confirm against the original file.
 */
180 buffer_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
182 hammer_cluster_t cluster;
183 hammer_record_ondisk_t rec;
184 hammer_buffer_t buffer;
185 int32_t buf_no;
186 int32_t rec_no;
187 int32_t rec_offset;
188 int32_t r;
189 int error;
192 * Extract cluster and buffer number to recover
194 cluster = info;
195 buf_no = blk / HAMMER_FSBUF_MAXBLKS;
198 * Mark the buffer as allocated in the cluster's master A-list.
200 r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
201 KKASSERT(r == buf_no);
203 kprintf("recover buffer1 %d:%d:%d\n",
204 cluster->volume->vol_no,
205 cluster->clu_no, buf_no);
206 buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
207 if (error) {
209 * If we are unable to access the buffer leave it in a
210 * reserved state on the master alist.
212 kprintf("hammer_recover_buffer_stage1: error "
213 "recovering %d:%d:%d\n",
214 cluster->volume->vol_no, cluster->clu_no, buf_no);
/* Callback contract: negative errno on failure. */
215 return(-error);
219 * Recover the buffer, scan and validate allocated records. Records
220 * which cannot be recovered are freed.
222 * The parent a-list must be properly adjusted so don't just call
223 * hammer_alist_recover() on the underlying buffer. Go through the
224 * parent.
226 hammer_modify_buffer(buffer);
/* count becomes the number of free record slots in this buffer. */
227 count = hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
228 kprintf("hammer_recover_buffer count1 %d/%d\n", count, HAMMER_RECORD_NODES);
/* Iterate every allocated record slot; find() resumes past rec_no. */
229 rec_no = -1;
230 for (;;) {
231 rec_no = hammer_alist_find(&buffer->alist, rec_no + 1,
232 HAMMER_RECORD_NODES);
233 if (rec_no == HAMMER_ALIST_BLOCK_NONE)
234 break;
235 #if 0
236 kprintf("recover record %d:%d:%d %d\n",
237 cluster->volume->vol_no,
238 cluster->clu_no, buf_no, rec_no);
239 #endif
/* Cluster-relative byte offset of this record. */
240 rec_offset = offsetof(union hammer_fsbuf_ondisk,
241 record.recs[rec_no]);
242 rec_offset += buf_no * HAMMER_BUFSIZE;
243 rec = &buffer->ondisk->record.recs[rec_no];
244 error = hammer_recover_record(cluster, buffer, rec_offset, rec);
245 if (error) {
246 kprintf("hammer_recover_record: failed %d:%d@%d\n",
247 cluster->clu_no, buffer->buf_no, rec_offset);
/* Unrecoverable record: release its slot and bump the free count. */
248 hammer_alist_free(&buffer->alist, rec_no, 1);
249 ++count; /* free count */
252 kprintf("hammer_recover_buffer count2 %d/%d\n", count, HAMMER_RECORD_NODES);
253 hammer_rel_buffer(buffer, 0);
/* Callback contract: positive free-block count on success. */
254 return(count);
258 * Recover a record, at least into a state that doesn't blow up the
259 * filesystem. Returns 0 on success, non-zero if the record is
260 * unrecoverable.
/*
 * hammer_recover_record() - validate a single on-disk record and
 * re-reserve any data storage it references, at least into a state
 * that doesn't blow up the filesystem (per the comment above).
 * Returns 0 on success, non-zero (EINVAL) if unrecoverable.
 *
 * Validation steps visible below: syncid check, B-Tree key range
 * check, data offset/length sanity, overlap checks against the record
 * itself, then re-reservation of the data either as whole buffers out
 * of the master A-list or as sub-allocated blocks out of alist_mdata.
 *
 * NOTE(review): numbered extraction; dropped brace/blank/comment-
 * delimiter lines mean block boundaries are implied.  Annotations are
 * review comments only; code text is untouched.
 */
262 static int
263 hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
264 int32_t rec_offset, hammer_record_ondisk_t rec)
266 hammer_buffer_t dbuf;
267 u_int64_t syncid = cluster->ondisk->synchronized_rec_id;
268 int32_t data_offset;
269 int32_t data_len;
270 int32_t nblks;
271 int32_t dbuf_no;
272 int32_t dblk_no;
273 int32_t base_blk;
274 int32_t r;
275 int error = 0;
278 * We have to discard any records with rec_id's greater then the
279 * last sync of the cluster header (which guarenteed all related
280 * buffers had been synced). Otherwise the record may reference
281 * information that was never synced to disk.
283 if (rec->base.rec_id >= syncid) {
284 kprintf("recover record: syncid too large %016llx/%016llx\n",
285 rec->base.rec_id, syncid);
286 return(EINVAL);
289 #if 0
290 /* XXX undo incomplete deletions */
291 if (rec->base.base.delete_tid > syncid)
292 rec->base.base.delete_tid = 0;
293 #endif
296 * Validate the record's B-Tree key
/* Spike records carry bounds keys and are exempt from the range check. */
298 if (rec->base.base.rec_type != HAMMER_RECTYPE_CLUSTER) {
299 if (hammer_btree_cmp(&rec->base.base,
300 &cluster->ondisk->clu_btree_beg) < 0) {
301 kprintf("recover record: range low\n");
302 return(EINVAL);
304 if (hammer_btree_cmp(&rec->base.base,
305 &cluster->ondisk->clu_btree_end) >= 0) {
306 kprintf("recover record: range high\n");
307 return(EINVAL);
312 * Validate the record's data. If the offset is 0 there is no data
313 * (or it is zero-fill) and we can return success immediately.
314 * Otherwise make sure everything is ok.
316 data_offset = rec->base.data_offset;
317 data_len = rec->base.data_len;
/* Normalize: zero length implies no data reference at all. */
319 if (data_len == 0)
320 rec->base.data_offset = data_offset = 0;
321 if (data_offset == 0)
322 goto done;
325 * Non-zero data offset, recover the data
/* Bounds: data must lie past the header buffer and inside clu_limit. */
327 if (data_offset < HAMMER_BUFSIZE ||
328 data_offset >= cluster->ondisk->clu_limit ||
329 data_len < 0 || data_len > HAMMER_MAXDATA ||
330 data_offset + data_len > cluster->ondisk->clu_limit) {
331 kprintf("recover record: bad offset/len %d/%d\n",
332 data_offset, data_len);
333 return(EINVAL);
337 * Check data_offset relative to rec_offset
/* Data range must not straddle or clobber the record structure. */
339 if (data_offset < rec_offset && data_offset + data_len > rec_offset) {
340 kprintf("recover record: bad offset: overlapping1\n");
341 return(EINVAL);
343 if (data_offset >= rec_offset &&
344 data_offset < rec_offset + sizeof(struct hammer_base_record)) {
345 kprintf("recover record: bad offset: overlapping2\n");
346 return(EINVAL);
350 * Check for data embedded in the record
/*
 * Data stored inline within the record slot itself needs no separate
 * allocation recovery -- just make sure it fits in the slot.
 */
352 if (data_offset >= rec_offset &&
353 data_offset < rec_offset + HAMMER_RECORD_SIZE) {
354 if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE) {
355 kprintf("recover record: bad offset: overlapping3\n");
356 return(EINVAL);
358 goto done;
362 * Recover the allocated data either out of the cluster's master alist
363 * or as a buffer sub-allocation.
/* Case 1: whole-buffer data -- reserve buffers in the master A-list. */
365 if ((data_len & HAMMER_BUFMASK) == 0) {
366 if (data_offset & HAMMER_BUFMASK) {
367 kprintf("recover record: bad offset: unaligned\n");
368 return(EINVAL);
370 nblks = data_len / HAMMER_BUFSIZE;
371 dbuf_no = data_offset / HAMMER_BUFSIZE;
373 r = hammer_alist_alloc_fwd(&cluster->alist_master,
374 nblks, dbuf_no);
375 if (r == HAMMER_ALIST_BLOCK_NONE) {
376 kprintf("recover record: cannot recover offset1\n");
377 return(EINVAL);
/* alloc_fwd landed elsewhere: the wanted range was already taken. */
379 if (r != dbuf_no) {
380 kprintf("recover record: cannot recover offset2\n");
381 hammer_alist_free(&cluster->alist_master, r, nblks);
382 return(EINVAL);
/* Case 2: sub-buffer data -- must fit inside one data buffer. */
384 } else {
385 if ((data_offset & ~HAMMER_BUFMASK) !=
386 ((data_offset + data_len - 1) & ~HAMMER_BUFMASK)) {
387 kprintf("recover record: overlaps multiple bufs\n");
388 return(EINVAL);
390 if ((data_offset & HAMMER_BUFMASK) <
391 sizeof(struct hammer_fsbuf_head)) {
392 kprintf("recover record: data in header area\n");
393 return(EINVAL);
395 if (data_offset & HAMMER_DATA_BLKMASK) {
396 kprintf("recover record: data blk unaligned\n");
397 return(EINVAL);
401 * Ok, recover the space in the data buffer.
402 */
403 dbuf_no = data_offset / HAMMER_BUFSIZE;
/* Probe: succeeds at dbuf_no only the first time we touch this buffer. */
404 r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
405 if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
406 hammer_alist_free(&cluster->alist_master, r, 1);
407 if (r == dbuf_no) {
409 * This is the first time we've tried to recover
410 * data in this data buffer, reinit it (but don't
411 * zero it out, obviously).
413 * Calling initbuffer marks the data blocks within
414 * the buffer as being free.
416 dbuf = hammer_get_buffer(cluster, dbuf_no,
417 0, &error);
418 if (error == 0) {
419 hammer_modify_buffer(dbuf);
420 hammer_initbuffer(&dbuf->alist,
421 &dbuf->ondisk->head,
422 HAMMER_FSBUF_DATA);
423 dbuf->buf_type = HAMMER_FSBUF_DATA;
/* Make this buffer's data blocks available in the cluster mdata list. */
425 base_blk = dbuf_no * HAMMER_FSBUF_MAXBLKS;
426 hammer_alist_free(&cluster->alist_mdata, base_blk,
427 HAMMER_DATA_NODES);
428 } else {
430 * We've seen this data buffer before.
432 dbuf = hammer_get_buffer(cluster, dbuf_no,
433 0, &error);
435 if (error) {
436 kprintf("recover record: data: getbuf failed\n");
437 return(EINVAL);
440 if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
441 hammer_rel_buffer(dbuf, 0);
442 kprintf("recover record: data: wrong buffer type\n");
443 return(EINVAL);
447 * Figure out the data block number and number of blocks.
/* Round data_len up to whole HAMMER_DATA_BLKSIZE blocks. */
449 nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
450 nblks /= HAMMER_DATA_BLKSIZE;
451 dblk_no = ((data_offset & HAMMER_BUFMASK) - offsetof(union hammer_fsbuf_ondisk, data.data)) / HAMMER_DATA_BLKSIZE;
/* Cross-check dblk_no against the on-disk layout; drops into the
 * kernel debugger on mismatch (development-era diagnostic). */
452 if ((data_offset & HAMMER_BUFMASK) != offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
453 kprintf("dblk_no %d does not match data_offset %d/%d\n",
454 dblk_no,
455 offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no]),
456 (data_offset & HAMMER_BUFMASK));
457 hammer_rel_buffer(dbuf, 0);
458 kprintf("recover record: data: not block aligned\n");
459 Debugger("bad data");
460 return(EINVAL);
/* Convert to a cluster-wide block number and reserve the blocks. */
462 dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS;
463 r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks, dblk_no);
464 if (r != dblk_no) {
465 if (r != HAMMER_ALIST_BLOCK_NONE)
466 hammer_alist_free(&cluster->alist_mdata, r, nblks);
467 hammer_rel_buffer(dbuf, 0);
468 kprintf("recover record: data: unable to realloc\n");
469 return(EINVAL);
471 hammer_rel_buffer(dbuf, 0);
473 done:
474 return(0);
478 * Rebuild the B-Tree for the records residing in the specified buffer.
480 * Return the number of records recovered.
/*
 * hammer_recover_buffer_stage2() - rebuild the B-Tree for the records
 * residing in the given record buffer (per the comment above).
 * Returns the number of records successfully inserted.
 *
 * NOTE(review): numbered extraction; dropped brace/blank lines mean
 * block boundaries are implied.  Annotations are review comments only.
 */
482 static int
483 hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
485 hammer_record_ondisk_t rec;
486 hammer_buffer_t buffer;
487 int32_t rec_no;
488 int32_t rec_offset;
489 int record_count = 0;
490 int error;
492 buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
493 if (error) {
495 * If we are unable to access the buffer leave it in a
496 * reserved state on the master alist.
498 kprintf("hammer_recover_buffer_stage2: error "
499 "recovering %d:%d:%d\n",
500 cluster->volume->vol_no, cluster->clu_no, buf_no);
/* Unreadable buffer contributes zero records; not treated as fatal. */
501 return(0);
505 * Recover the buffer, scan and validate allocated records. Records
506 * which cannot be recovered are freed.
/* Walk every allocated record slot (stage 1 already validated them). */
508 rec_no = -1;
509 for (;;) {
510 rec_no = hammer_alist_find(&buffer->alist, rec_no + 1,
511 HAMMER_RECORD_NODES);
512 if (rec_no == HAMMER_ALIST_BLOCK_NONE)
513 break;
/* Cluster-relative byte offset of the record, for the B-Tree elm. */
514 rec_offset = offsetof(union hammer_fsbuf_ondisk,
515 record.recs[rec_no]);
516 rec_offset += buf_no * HAMMER_BUFSIZE;
517 rec = &buffer->ondisk->record.recs[rec_no];
518 error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
519 if (error) {
520 kprintf("hammer_recover_btree: failed %d:%d@%d\n",
521 cluster->clu_no, buffer->buf_no, rec_offset);
522 /* XXX free the record and its data? */
523 /*hammer_alist_free(&buffer->alist, rec_no, 1);*/
524 } else {
525 ++record_count;
528 hammer_rel_buffer(buffer, 0);
529 return(record_count);
533 * Enter a single record into the B-Tree.
535 static int
536 hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
537 int32_t rec_offset, hammer_record_ondisk_t rec)
539 struct hammer_cursor cursor;
540 union hammer_btree_elm elm;
541 hammer_cluster_t ncluster;
542 int error = 0;
545 * Check for a spike record. When spiking into a new cluster do
546 * NOT allow a recursive recovery to occur. We use a lot of
547 * stack and the only thing we actually modify in the target
548 * cluster is its parent pointer.
550 if (rec->base.base.rec_type == HAMMER_RECTYPE_CLUSTER) {
551 hammer_volume_t ovolume = cluster->volume;
552 hammer_volume_t nvolume;
554 nvolume = hammer_get_volume(ovolume->hmp, rec->spike.vol_no,
555 &error);
556 if (error)
557 return(error);
558 ncluster = hammer_get_cluster(nvolume, rec->spike.clu_no,
559 &error, GET_CLUSTER_NORECOVER);
560 hammer_rel_volume(nvolume, 0);
561 if (error)
562 return(error);
565 * Validate the cluster. Allow the offset to be fixed up.
567 if (ncluster->ondisk->clu_btree_parent_vol_no != ovolume->vol_no ||
568 ncluster->ondisk->clu_btree_parent_clu_no != cluster->clu_no) {
569 kprintf("hammer_recover: Bad cluster spike hookup: "
570 "%d:%d != %d:%d\n",
571 ncluster->ondisk->clu_btree_parent_vol_no,
572 ncluster->ondisk->clu_btree_parent_clu_no,
573 ovolume->vol_no,
574 ovolume->vol_no);
575 error = EINVAL;
576 hammer_rel_cluster(ncluster, 0);
577 return(error);
579 } else {
580 ncluster = NULL;
584 * Locate the insertion point. Note that we are using the cluster-
585 * localized cursor init so parent will start out NULL.
587 * The key(s) used for spike's are bounds and different from the
588 * key embedded in the spike record. A special B-Tree insertion
589 * call is made to deal with spikes.
591 error = hammer_init_cursor_cluster(&cursor, cluster);
592 if (error)
593 goto failed;
594 KKASSERT(cursor.node);
595 if (ncluster)
596 cursor.key_beg = ncluster->ondisk->clu_btree_beg;
597 else
598 cursor.key_beg = rec->base.base;
599 cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_RECOVER;
601 error = hammer_btree_lookup(&cursor);
602 KKASSERT(error != EDEADLK);
603 KKASSERT(cursor.node);
604 if (error == 0) {
605 kprintf("hammer_recover_btree: Duplicate record cursor %p rec %p ncluster %p\n",
606 &cursor, rec, ncluster);
607 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index], HAMMER_BTREE_TYPE_LEAF, cursor.index);
608 Debugger("duplicate record");
610 if (error != ENOENT)
611 goto failed;
613 if (ncluster) {
615 * Spike record
617 kprintf("recover spike clu %d %016llx-%016llx\n",
618 ncluster->clu_no,
619 ncluster->ondisk->clu_btree_beg.obj_id,
620 ncluster->ondisk->clu_btree_end.obj_id);
621 error = hammer_btree_insert_cluster(&cursor, ncluster,
622 rec_offset);
623 kprintf("recover spike record error %d\n", error);
624 KKASSERT(error != EDEADLK);
625 if (error)
626 Debugger("spike recovery");
627 } else {
629 * Normal record
631 #if 0
632 kprintf("recover recrd clu %d %016llx\n",
633 cluster->clu_no, rec->base.base.obj_id);
634 #endif
635 elm.leaf.base = rec->base.base;
636 elm.leaf.rec_offset = rec_offset;
637 elm.leaf.data_offset = rec->base.data_offset;
638 elm.leaf.data_len = rec->base.data_len;
639 elm.leaf.data_crc = rec->base.data_crc;
641 error = hammer_btree_insert(&cursor, &elm);
642 KKASSERT(error != EDEADLK);
645 if (error) {
646 kprintf("hammer_recover_btree: insertion failed\n");
649 failed:
650 if (ncluster)
651 hammer_rel_cluster(ncluster, 0);
652 hammer_done_cursor(&cursor);
653 return(error);