HAMMER 17/many: Refactor IO backend, clean up buffer cache deadlocks.
[dragonfly.git] / sys / vfs / hammer / hammer_recover.c
blob 509ac56a4d067a6dddcd1b5750efd66269c61f6b
/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_recover.c,v 1.2 2008/01/10 07:41:03 dillon Exp $
 */

#include "hammer.h"

static void hammer_recover_buffer_stage1(hammer_cluster_t cluster,
			int32_t buf_no);
static void hammer_recover_buffer_stage2(hammer_cluster_t cluster,
			int32_t buf_no);
static int hammer_recover_record(hammer_cluster_t cluster,
			hammer_buffer_t buffer, int32_t rec_offset,
			hammer_record_ondisk_t rec);
static int hammer_recover_btree(hammer_cluster_t cluster,
			hammer_buffer_t buffer, int32_t rec_offset,
			hammer_record_ondisk_t rec);
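
/*
 * Recovery runs in two passes over the cluster's record buffers:
 * stage 1 re-reserves each record buffer and any data it references
 * against the freshly reinitialized A-lists, and stage 2 then
 * re-inserts the surviving records into a newly constructed B-Tree.
 */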

/*
 * Recover a cluster.  The caller has referenced and locked the cluster.
 *
 * Generally returns 0 on success and EIO if the recovery was unsuccessful.
 */
int
hammer_recover(hammer_cluster_t cluster)
{
	int buf_no;
	int nbuffers;
	int32_t r;
	u_int32_t bitmap;

	return(0);	/* XXX temporarily disabled */
	Debugger("hammer_recover");
	KKASSERT(cluster->ondisk->synchronized_tid);
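
	/*
	 * synchronized_tid is the recovery horizon: records created
	 * after it are discarded and deletions made after it are
	 * undone (see hammer_recover_record()).
	 */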

	nbuffers = cluster->ondisk->clu_limit / HAMMER_BUFSIZE;
	hammer_modify_cluster(cluster);

	/*
	 * Re-initialize the A-lists.
	 */
	hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
			  HAMMER_ASTATE_FREE);
	hammer_alist_init(&cluster->alist_btree,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
			  HAMMER_ASTATE_ALLOC);
	hammer_alist_init(&cluster->alist_mdata,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
			  HAMMER_ASTATE_ALLOC);
	hammer_alist_init(&cluster->alist_record,
			  HAMMER_FSBUF_MAXBLKS,
			  (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
			  HAMMER_ASTATE_ALLOC);
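
	/*
	 * Note that buffer 0 holds the cluster header, which is why the
	 * A-lists above cover buffers 1 through nbuffers - 1 only.
	 */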

	/*
	 * Scan the cluster's clu_record_buf_bitmap, reserve record buffers
	 * from the master bitmap before we try to recover their data.  Free
	 * the block of records to alist_record.
	 *
	 * We need to mark the blocks as free in alist_record so future
	 * allocations will dive into the buffer A-lists, but we don't
	 * want to destroy the underlying buffer A-lists.  Because the
	 * meta-data in cluster->alist_record indicates state 00 (all-allocated
	 * but not initialized), it will not dive down into the buffer when
	 * freeing the entire buffer.
	 */
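	/*
	 * When a 32-bit word of the bitmap reads zero, the loops below
	 * jump buf_no to the last buffer of that word so the for loop's
	 * ++buf_no resumes the scan at the next word.  For example, if
	 * buf_no is 33 and its word is zero, buf_no becomes 63 and the
	 * next iteration begins at 64.
	 */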
	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
		if (bitmap == 0) {
			buf_no = ((buf_no + 32) & ~31) - 1;
			continue;
		}
		if ((bitmap & (1 << (buf_no & 31))) == 0)
			continue;
		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, buf_no);
		KKASSERT(r == buf_no);
		hammer_alist_free(&cluster->alist_record,
				  buf_no * HAMMER_FSBUF_MAXBLKS,
				  HAMMER_FSBUF_MAXBLKS);
	}

	/*
	 * Scan the cluster's clu_record_buf_bitmap, reassign buffers
	 * from alist_master to alist_record, and reallocate individual
	 * records and any related data references if they meet the criteria.
	 */
	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
		if (bitmap == 0) {
			buf_no = ((buf_no + 32) & ~31) - 1;
			continue;
		}
		if ((bitmap & (1 << (buf_no & 31))) == 0)
			continue;
		hammer_recover_buffer_stage1(cluster, buf_no);
	}

	/*
	 * The cluster is now in good enough shape that general allocations
	 * are possible.  Construct an empty B-Tree root.
	 */
	{
		hammer_node_t croot;
		int error;

		croot = hammer_alloc_btree(cluster, &error);
		if (error == 0) {
			hammer_modify_node(croot);
			bzero(croot->ondisk, sizeof(*croot->ondisk));
			croot->ondisk->count = 0;
			croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
			cluster->ondisk->clu_btree_root = croot->node_offset;
		}
	}

	/*
	 * Scan the cluster's clu_record_buf_bitmap again and regenerate
	 * the B-Tree.
	 *
	 * XXX B-tree record for cluster-push
	 */
	for (buf_no = 1; buf_no < nbuffers; ++buf_no) {
		bitmap = cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5];
		if (bitmap == 0) {
			buf_no = ((buf_no + 32) & ~31) - 1;
			continue;
		}
		if ((bitmap & (1 << (buf_no & 31))) == 0)
			continue;
		hammer_recover_buffer_stage2(cluster, buf_no);
	}

	/*
	 * Validate the parent cluster pointer. XXX
	 */
	return(0);
}

/*
 * Reassign buf_no as a record buffer and recover any related data
 * references.
 */
static void
hammer_recover_buffer_stage1(hammer_cluster_t cluster, int32_t buf_no)
{
	hammer_record_ondisk_t rec;
	hammer_buffer_t buffer;
	int32_t rec_no;
	int32_t rec_offset;
	int error;

	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
	if (error) {
		/*
		 * If we are unable to access the buffer leave it in a
		 * reserved state on the master alist.
		 */
		kprintf("hammer_recover_buffer_stage1: error "
			"recovering %d:%d:%d\n",
			cluster->volume->vol_no, cluster->clu_no, buf_no);
		return;
	}

	/*
	 * Recover the buffer, scan and validate allocated records.  Records
	 * which cannot be recovered are freed.
	 */
	hammer_modify_buffer(buffer);
	hammer_alist_recover(&buffer->alist, 0, 0, HAMMER_RECORD_NODES);
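	/*
	 * (hammer_alist_recover() rebuilds the buffer's A-list state so
	 * the allocated record slots can be iterated with
	 * hammer_alist_find() below.)
	 */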
	rec_no = -1;
	for (;;) {
		rec_no = hammer_alist_find(&buffer->alist, rec_no + 1);
		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
			break;
		rec_offset = offsetof(union hammer_fsbuf_ondisk,
				      record.recs[rec_no]);
		rec_offset += buf_no * HAMMER_BUFSIZE;
		rec = &buffer->ondisk->record.recs[rec_no];
		error = hammer_recover_record(cluster, buffer, rec_offset, rec);
		if (error) {
			kprintf("hammer_recover_record: failed %d:%d@%d\n",
				cluster->clu_no, buffer->buf_no, rec_offset);
			hammer_alist_free(&buffer->alist, rec_no, 1);
		}
	}
	hammer_rel_buffer(buffer, 0);
}

/*
 * Recover a record, at least into a state that doesn't blow up the
 * filesystem.  Returns 0 on success, non-zero if the record is
 * unrecoverable.
 */
static int
hammer_recover_record(hammer_cluster_t cluster, hammer_buffer_t buffer,
		      int32_t rec_offset, hammer_record_ondisk_t rec)
{
	hammer_buffer_t dbuf;
	hammer_tid_t syncid = cluster->ondisk->synchronized_tid;
	int32_t data_offset;
	int32_t data_len;
	int32_t nblks;
	int32_t dbuf_no;
	int32_t dblk_no;
	int32_t r;
	int error = 0;

	/*
	 * Discard records created after the synchronization point and
	 * undo any deletions made after the synchronization point.
	 */
	if (rec->base.base.create_tid > syncid)
		return(EINVAL);

	if (rec->base.base.delete_tid > syncid)
		rec->base.base.delete_tid = 0;

	/*
	 * Validate the record's B-Tree key.
	 */
	if (hammer_btree_cmp(&rec->base.base,
			     &cluster->ondisk->clu_btree_beg) < 0) {
		return(EINVAL);
	}
	if (hammer_btree_cmp(&rec->base.base,
			     &cluster->ondisk->clu_btree_end) >= 0) {
		return(EINVAL);
	}

	/*
	 * Validate the record's data.  If the offset is 0 there is no data
	 * (or it is zero-fill) and we can return success immediately.
	 * Otherwise make sure everything is ok.
	 */
	data_offset = rec->base.data_offset;
	data_len = rec->base.data_len;

	if (data_len == 0)
		rec->base.data_offset = 0;
	if (data_offset == 0)
		return(0);
	if (data_offset < HAMMER_BUFSIZE ||
	    data_offset >= cluster->ondisk->clu_limit ||
	    data_len < 0 || data_len > HAMMER_MAXDATA ||
	    data_offset + data_len > cluster->ondisk->clu_limit) {
		return(EINVAL);
	}

	/*
	 * Check data_offset relative to rec_offset: the data must either
	 * lie entirely outside the record or begin past the record's
	 * base structure.
	 */
	if (data_offset < rec_offset && data_offset + data_len > rec_offset)
		return(EINVAL);
	if (data_offset >= rec_offset &&
	    data_offset < rec_offset + sizeof(struct hammer_base_record)) {
		return(EINVAL);
	}

	/*
	 * Check for data embedded in the record.
	 */
	if (data_offset >= rec_offset &&
	    data_offset < rec_offset + HAMMER_RECORD_SIZE) {
		if (data_offset + data_len > rec_offset + HAMMER_RECORD_SIZE)
			return(EINVAL);
		return(0);
	}
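
	/*
	 * The embedded case above requires no A-list bookkeeping because
	 * the data lives inside the record's own allocation.
	 */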

	/*
	 * Recover the allocated data either out of the cluster's master alist
	 * or as a buffer sub-allocation.
	 */
	if ((data_len & HAMMER_BUFMASK) == 0) {
		if (data_offset & HAMMER_BUFMASK)
			return(EINVAL);
		nblks = data_len / HAMMER_BUFSIZE;
		dbuf_no = data_offset / HAMMER_BUFSIZE;

		r = hammer_alist_alloc_fwd(&cluster->alist_master,
					   nblks, dbuf_no);
		if (r == HAMMER_ALIST_BLOCK_NONE)
			return(EINVAL);
		if (r != dbuf_no) {
			hammer_alist_free(&cluster->alist_master, r, nblks);
			return(EINVAL);
		}
	} else {
		if ((data_offset & ~HAMMER_BUFMASK) !=
		    ((data_offset + data_len) & ~HAMMER_BUFMASK)) {
			return(EINVAL);
		}
		if ((data_offset & HAMMER_BUFMASK) <
		    sizeof(struct hammer_fsbuf_head)) {
			return(EINVAL);
		}
		if (data_offset & HAMMER_DATA_BLKMASK)
			return(EINVAL);

		/*
		 * Ok, recover the space in the data buffer.
		 */
		dbuf_no = data_offset / HAMMER_BUFSIZE;
		r = hammer_alist_alloc_fwd(&cluster->alist_master, 1, dbuf_no);
		if (r != dbuf_no && r != HAMMER_ALIST_BLOCK_NONE)
			hammer_alist_free(&cluster->alist_master, r, 1);
		if (r == dbuf_no) {
			/*
			 * This is the first time we've tried to recover
			 * data in this data buffer, reinit it (but don't
			 * zero it out, obviously).
			 *
			 * Calling initbuffer marks the data blocks within
			 * the buffer as being free.
			 */
			dbuf = hammer_get_buffer(cluster, dbuf_no,
						 0, &error);
			if (error == 0) {
				hammer_modify_buffer(dbuf);
				hammer_initbuffer(&dbuf->alist,
						  &dbuf->ondisk->head,
						  HAMMER_FSBUF_DATA);
				dbuf->buf_type = HAMMER_FSBUF_DATA;
			}
		} else {
			/*
			 * We've seen this data buffer before.
			 */
			dbuf = hammer_get_buffer(cluster, dbuf_no,
						 0, &error);
		}
		if (error)
			return(EINVAL);

		if (dbuf->buf_type != HAMMER_FSBUF_DATA) {
			hammer_rel_buffer(dbuf, 0);
			return(EINVAL);
		}

		/*
		 * Figure out the data block number and number of blocks.
		 */
		nblks = (data_len + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
		nblks /= HAMMER_DATA_BLKSIZE;
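		/*
		 * dblk_no starts out as the data block index within this
		 * buffer, measured from the start of the fsbuf's data
		 * area.
		 */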
		dblk_no = ((data_offset & HAMMER_BUFMASK) -
			   offsetof(union hammer_fsbuf_ondisk, data.data)) /
			  HAMMER_DATA_BLKSIZE;
		if ((data_offset & HAMMER_BUFMASK) !=
		    offsetof(union hammer_fsbuf_ondisk, data.data[dblk_no])) {
			kprintf("dblk_no %d does not match data_offset %d/%d\n",
				dblk_no,
				offsetof(union hammer_fsbuf_ondisk,
					 data.data[dblk_no]),
				(data_offset & HAMMER_BUFMASK));
			hammer_rel_buffer(dbuf, 0);
			Debugger("bad data");
			return(EINVAL);
		}
		/* convert to a cluster-wide alist_mdata block index */
		dblk_no += dbuf_no * HAMMER_FSBUF_MAXBLKS;
		r = hammer_alist_alloc_fwd(&cluster->alist_mdata, nblks,
					   dblk_no);
		if (r != dblk_no) {
			if (r != HAMMER_ALIST_BLOCK_NONE)
				hammer_alist_free(&cluster->alist_mdata, r,
						  nblks);
			hammer_rel_buffer(dbuf, 0);
			return(EINVAL);
		}
		hammer_rel_buffer(dbuf, 0);
	}
	return(0);
}

/*
 * Rebuild the B-Tree for the records residing in the specified buffer.
 */
static void
hammer_recover_buffer_stage2(hammer_cluster_t cluster, int32_t buf_no)
{
	hammer_record_ondisk_t rec;
	hammer_buffer_t buffer;
	int32_t rec_no;
	int32_t rec_offset;
	int error;

	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
	if (error) {
		/*
		 * If we are unable to access the buffer leave it in a
		 * reserved state on the master alist.
		 */
		kprintf("hammer_recover_buffer_stage2: error "
			"recovering %d:%d:%d\n",
			cluster->volume->vol_no, cluster->clu_no, buf_no);
		return;
	}

	/*
	 * Scan the buffer's allocated records and insert each one into
	 * the new B-Tree.  Records which cannot be inserted are left
	 * allocated for now (see the XXX below).
	 */
	rec_no = -1;
	for (;;) {
		rec_no = hammer_alist_find(&buffer->alist, rec_no + 1);
		if (rec_no == HAMMER_ALIST_BLOCK_NONE)
			break;
		rec_offset = offsetof(union hammer_fsbuf_ondisk,
				      record.recs[rec_no]);
		rec_offset += buf_no * HAMMER_BUFSIZE;
		rec = &buffer->ondisk->record.recs[rec_no];
		error = hammer_recover_btree(cluster, buffer, rec_offset, rec);
		if (error) {
			kprintf("hammer_recover_btree: failed %d:%d@%d\n",
				cluster->clu_no, buffer->buf_no, rec_offset);
			/* XXX free the record and its data? */
			/*hammer_alist_free(&buffer->alist, rec_no, 1);*/
		}
	}
	hammer_rel_buffer(buffer, 0);
}

static int
hammer_recover_btree(hammer_cluster_t cluster, hammer_buffer_t buffer,
		     int32_t rec_offset, hammer_record_ondisk_t rec)
{
	struct hammer_cursor cursor;
	union hammer_btree_elm elm;
	int error;

	error = hammer_init_cursor_cluster(&cursor, cluster);
	if (error)
		goto failed;
	cursor.key_beg = rec->base.base;
	cursor.flags = HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_recover_btree: Duplicate record\n");
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		Debugger("duplicate record");
	}
	if (error != ENOENT)
		goto failed;
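
	/*
	 * The lookup failed with ENOENT, which is what we want: the
	 * cursor is now positioned for the insert.  Build a leaf element
	 * pointing back at the on-disk record and its data reference.
	 */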
	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = rec_offset;
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = rec->base.data_len;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);
	if (error) {
		kprintf("hammer_recover_btree: insertion failed\n");
	}

	/* XXX cluster pushes? */

failed:
	hammer_done_cursor(&cursor);
	return(error);
}