HAMMER 60F/Many: Mirroring
[dragonfly.git] / sys / vfs / hammer / hammer_mirror.c
blob8cdd83ecbc773eb1de14a3be88ef6dbc20b33c6b
1 /*
2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.8 2008/07/07 03:49:51 dillon Exp $
37 * HAMMER mirroring ioctls - serialize and deserialize modifications made
38 * to a filesystem.
41 #include "hammer.h"
43 static int hammer_mirror_check(hammer_cursor_t cursor,
44 struct hammer_ioc_mrecord *mrec);
45 static int hammer_mirror_update(hammer_cursor_t cursor,
46 struct hammer_ioc_mrecord *mrec);
47 static int hammer_mirror_write(hammer_cursor_t cursor,
48 struct hammer_ioc_mrecord *mrec,
49 hammer_inode_t ip, char *udata);
50 static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
51 hammer_btree_leaf_elm_t leaf);
/*
 * Mirror-read ioctl backend: serialize B-Tree records to userland.
 *
 * All B-Tree records within the specified key range which also conform
 * to the transaction id range are returned.  Mirroring code keeps track
 * of the last transaction id fully scanned and can efficiently pick up
 * where it left off if interrupted.
 *
 * Returns 0 or an errno.  EINTR is converted to 0 with
 * HAMMER_IOC_HEAD_INTR set so userland can restart from key_cur.
 */
59 int
60 hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
61 struct hammer_ioc_mirror_rw *mirror)
63 struct hammer_cursor cursor;
64 struct hammer_ioc_mrecord mrec;
65 hammer_btree_leaf_elm_t elm;
66 const int head_size = HAMMER_MREC_HEADSIZE;
67 const int crc_start = HAMMER_MREC_CRCOFF;
68 char *uptr;
69 int error;
70 int data_len;
71 int bytes;
/*
 * Reject user keys carrying pseudofs localization bits; the
 * localization is derived from ip below.
 */
73 if ((mirror->key_beg.localization | mirror->key_end.localization) &
74 HAMMER_LOCALIZE_PSEUDOFS_MASK) {
75 return(EINVAL);
77 if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
78 return(EINVAL);
/*
 * key_cur tracks scan progress and is reported back to userland on
 * return so an interrupted scan can be resumed.
 */
80 mirror->key_cur = mirror->key_beg;
81 mirror->key_cur.localization += ip->obj_localization;
82 bzero(&mrec, sizeof(mrec));
84 retry:
85 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
86 if (error) {
87 hammer_done_cursor(&cursor);
88 goto failed;
90 cursor.key_beg = mirror->key_cur;
91 cursor.key_end = mirror->key_end;
92 cursor.key_end.localization += ip->obj_localization;
94 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
95 cursor.flags |= HAMMER_CURSOR_BACKEND;
/*
 * This flag filters the search to only return elements whose create
 * or delete TID is >= mirror_tid.  The B-Tree uses the mirror_tid
 * field stored with internal and leaf nodes to shortcut the scan.
 */
102 cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
103 cursor.mirror_tid = mirror->tid_beg;
105 error = hammer_btree_first(&cursor);
106 while (error == 0) {
/*
 * Leaf node.  Only return elements modified in the range
 * requested by userland.
 */
111 KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
112 elm = &cursor.node->ondisk->elms[cursor.index].leaf;
/*
 * Skip elements whose create_tid AND delete_tid both fall
 * outside [tid_beg, tid_end).
 */
114 if (elm->base.create_tid < mirror->tid_beg ||
115 elm->base.create_tid >= mirror->tid_end) {
116 if (elm->base.delete_tid < mirror->tid_beg ||
117 elm->base.delete_tid >= mirror->tid_end) {
118 goto skip;
122 mirror->key_cur = elm->base;
/*
 * Yield to more important tasks: pending signals, the sync
 * lock, and excessive dirty-buffer backlog (kick the flusher).
 */
127 if ((error = hammer_signal_check(trans->hmp)) != 0)
128 break;
129 if (trans->hmp->sync_lock.wanted) {
130 tsleep(trans, 0, "hmrslo", hz / 10);
132 if (trans->hmp->locked_dirty_space +
133 trans->hmp->io_running_space > hammer_limit_dirtybufspace) {
134 hammer_flusher_async(trans->hmp);
135 tsleep(trans, 0, "hmrslo", hz / 10);
/*
 * The core code exports the data to userland.
 */
141 data_len = (elm->data_offset) ? elm->data_len : 0;
142 if (data_len) {
143 error = hammer_btree_extract(&cursor,
144 HAMMER_CURSOR_GET_DATA);
145 if (error)
146 break;
/*
 * Stop when the aligned record no longer fits in the
 * remaining user buffer.
 */
148 bytes = sizeof(struct hammer_ioc_mrecord) + data_len;
149 bytes = (bytes + HAMMER_HEAD_ALIGN_MASK) &
150 ~HAMMER_HEAD_ALIGN_MASK;
151 if (mirror->count + bytes > mirror->size)
152 break;
/*
 * Construct the record for userland and copyout.
 *
 * The user is asking for a snapshot, if the record was
 * deleted beyond the user-requested ending tid, the record
 * is not considered deleted from the point of view of
 * userland and delete_tid is cleared.
 */
162 mrec.signature = HAMMER_IOC_MIRROR_SIGNATURE;
163 mrec.type = HAMMER_MREC_TYPE_REC;
164 mrec.rec_size = bytes;
165 mrec.leaf = *elm;
166 if (elm->base.delete_tid >= mirror->tid_end)
167 mrec.leaf.base.delete_tid = 0;
/* CRC covers the header starting at rec_size (signature/crc excluded) */
168 mrec.rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
169 uptr = (char *)mirror->ubuf + mirror->count;
170 error = copyout(&mrec, uptr, head_size);
171 if (data_len && error == 0) {
172 error = copyout(cursor.data, uptr + head_size,
173 data_len);
175 if (error == 0)
176 mirror->count += bytes;
177 skip:
178 if (error == 0) {
179 cursor.flags |= HAMMER_CURSOR_ATEDISK;
180 error = hammer_btree_iterate(&cursor);
/*
 * ENOENT terminates the scan normally: report full coverage of the
 * requested range to userland.
 */
183 if (error == ENOENT) {
184 mirror->key_cur = mirror->key_end;
185 error = 0;
187 hammer_done_cursor(&cursor);
/* EDEADLK is a recoverable cursor collision; restart from key_cur */
188 if (error == EDEADLK)
189 goto retry;
190 if (error == EINTR) {
191 mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
192 error = 0;
194 failed:
/* strip the localization bits added above before reporting key_cur */
195 mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
196 return(error);
200 * Copy records from userland to the target mirror. Records which already
201 * exist may only have their delete_tid updated.
203 * The passed ip is the root ip of the pseudofs
206 hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
207 struct hammer_ioc_mirror_rw *mirror)
209 struct hammer_cursor cursor;
210 struct hammer_ioc_mrecord mrec;
211 const int head_size = HAMMER_MREC_HEADSIZE;
212 const int crc_start = HAMMER_MREC_CRCOFF;
213 u_int32_t rec_crc;
214 int error;
215 char *uptr;
217 if (mirror->size < 0 || mirror->size > 0x70000000)
218 return(EINVAL);
220 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
221 retry:
222 hammer_normalize_cursor(&cursor);
224 while (error == 0 && mirror->count + head_size <= mirror->size) {
226 * Acquire and validate header
228 uptr = (char *)mirror->ubuf + mirror->count;
229 error = copyin(uptr, &mrec, head_size);
230 if (error)
231 break;
232 rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
233 if (mrec.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
234 error = EINVAL;
235 break;
237 if (mrec.type != HAMMER_MREC_TYPE_REC) {
238 error = EINVAL;
239 break;
241 if (rec_crc != mrec.rec_crc) {
242 error = EINVAL;
243 break;
245 if (mrec.rec_size < head_size ||
246 mrec.rec_size > head_size + HAMMER_XBUFSIZE + 16 ||
247 mirror->count + mrec.rec_size > mirror->size) {
248 error = EINVAL;
249 break;
251 if (mrec.leaf.data_len < 0 ||
252 mrec.leaf.data_len > HAMMER_XBUFSIZE ||
253 sizeof(struct hammer_ioc_mrecord) + mrec.leaf.data_len > mrec.rec_size) {
254 error = EINVAL;
258 * Re-localize for target. relocalization of data is handled
259 * by hammer_mirror_write().
261 mrec.leaf.base.localization &= HAMMER_LOCALIZE_MASK;
262 mrec.leaf.base.localization += ip->obj_localization;
265 * Locate the record.
267 * If the record exists only the delete_tid may be updated.
269 * If the record does not exist we create it. For now we
270 * ignore records with a non-zero delete_tid. Note that
271 * mirror operations are effective an as-of operation and
272 * delete_tid can be 0 for mirroring purposes even if it is
273 * not actually 0 at the originator.
275 hammer_normalize_cursor(&cursor);
276 cursor.key_beg = mrec.leaf.base;
277 cursor.flags |= HAMMER_CURSOR_BACKEND;
278 cursor.flags &= ~HAMMER_CURSOR_INSERT;
279 error = hammer_btree_lookup(&cursor);
281 if (error == 0 && hammer_mirror_check(&cursor, &mrec)) {
282 hammer_sync_lock_sh(trans);
283 error = hammer_mirror_update(&cursor, &mrec);
284 hammer_sync_unlock(trans);
285 } else if (error == ENOENT && mrec.leaf.base.delete_tid == 0) {
286 hammer_sync_lock_sh(trans);
287 error = hammer_mirror_write(&cursor, &mrec, ip,
288 uptr + head_size);
289 hammer_sync_unlock(trans);
290 } else if (error == ENOENT) {
291 error = 0;
295 * Clean for loop. It is ok if the record already exists
296 * on the target.
298 if (error == EDEADLK) {
299 hammer_done_cursor(&cursor);
300 error = hammer_init_cursor(trans, &cursor, NULL, NULL);
301 goto retry;
304 if (error == EALREADY)
305 error = 0;
306 if (error == 0)
307 mirror->count += mrec.rec_size;
309 hammer_done_cursor(&cursor);
310 return(0);
314 * Check whether an update is needed in the case where a match already
315 * exists on the target. The only type of update allowed in this case
316 * is an update of the delete_tid.
318 * Return non-zero if the update should proceed.
320 static
322 hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
324 hammer_btree_leaf_elm_t leaf = cursor->leaf;
326 if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
327 if (leaf->base.delete_tid != 0)
328 return(1);
330 return(0);
334 * Update a record in-place. Only the delete_tid can change.
336 static
338 hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
340 hammer_transaction_t trans;
341 hammer_btree_leaf_elm_t elm;
343 elm = cursor->leaf;
344 trans = cursor->trans;
346 if (mrec->leaf.base.delete_tid == 0) {
347 kprintf("mirror_write: object %016llx:%016llx deleted on "
348 "target, not deleted on source\n",
349 elm->base.obj_id, elm->base.key);
350 return(0);
353 KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid);
354 hammer_modify_node(trans, cursor->node, elm, sizeof(*elm));
355 elm->base.delete_tid = mrec->leaf.base.delete_tid;
356 elm->delete_ts = mrec->leaf.delete_ts;
357 hammer_modify_node_done(cursor->node);
360 * Track a count of active inodes.
362 if (elm->base.obj_type == HAMMER_RECTYPE_INODE) {
363 hammer_modify_volume_field(trans,
364 trans->rootvol,
365 vol0_stat_inodes);
366 --trans->hmp->rootvol->ondisk->vol0_stat_inodes;
367 hammer_modify_volume_done(trans->rootvol);
370 return(0);
374 * Write out a new record.
376 static
378 hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
379 hammer_inode_t ip, char *udata)
381 hammer_transaction_t trans;
382 hammer_buffer_t data_buffer;
383 hammer_off_t ndata_offset;
384 hammer_tid_t high_tid;
385 void *ndata;
386 int error;
387 int doprop;
390 * Skip records related to the root inode other then
391 * directory entries.
393 if (mrec->leaf.base.obj_id == HAMMER_OBJID_ROOT) {
394 if (mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE ||
395 mrec->leaf.base.rec_type == HAMMER_RECTYPE_FIX) {
396 return(0);
400 trans = cursor->trans;
401 data_buffer = NULL;
404 * Allocate and adjust data
406 if (mrec->leaf.data_len && mrec->leaf.data_offset) {
407 ndata = hammer_alloc_data(trans, mrec->leaf.data_len,
408 mrec->leaf.base.rec_type,
409 &ndata_offset, &data_buffer, &error);
410 if (ndata == NULL)
411 return(error);
412 mrec->leaf.data_offset = ndata_offset;
413 hammer_modify_buffer(trans, data_buffer, NULL, 0);
414 error = copyin(udata, ndata, mrec->leaf.data_len);
415 if (error == 0) {
416 if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
417 kprintf("data crc mismatch on pipe\n");
418 error = EINVAL;
419 } else {
420 error = hammer_mirror_localize_data(
421 ndata, &mrec->leaf);
424 hammer_modify_buffer_done(data_buffer);
425 } else {
426 mrec->leaf.data_offset = 0;
427 error = 0;
428 ndata = NULL;
430 if (error)
431 goto failed;
434 * Do the insertion
436 cursor->flags |= HAMMER_CURSOR_INSERT;
437 error = hammer_btree_lookup(cursor);
438 if (error != ENOENT) {
439 if (error == 0)
440 error = EALREADY;
441 goto failed;
443 error = 0;
445 error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);
448 * Track a count of active inodes.
450 if (error == 0 && mrec->leaf.base.delete_tid == 0 &&
451 mrec->leaf.base.obj_type == HAMMER_RECTYPE_INODE) {
452 hammer_modify_volume_field(trans,
453 trans->rootvol,
454 vol0_stat_inodes);
455 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes;
456 hammer_modify_volume_done(trans->rootvol);
460 * vol0_next_tid must track the highest TID stored in the filesystem.
461 * We do not need to generate undo for this update.
463 high_tid = mrec->leaf.base.create_tid;
464 if (high_tid < mrec->leaf.base.delete_tid)
465 high_tid = mrec->leaf.base.delete_tid;
466 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) {
467 hammer_modify_volume(trans, trans->rootvol, NULL, 0);
468 trans->rootvol->ondisk->vol0_next_tid = high_tid;
469 hammer_modify_volume_done(trans->rootvol);
472 if (error == 0 && doprop)
473 hammer_btree_do_propagation(cursor, ip, &mrec->leaf);
475 failed:
477 * Cleanup
479 if (error && mrec->leaf.data_offset) {
480 hammer_blockmap_free(cursor->trans,
481 mrec->leaf.data_offset,
482 mrec->leaf.data_len);
484 if (data_buffer)
485 hammer_rel_buffer(data_buffer, 0);
486 return(error);
490 * Localize the data payload. Directory entries may need their
491 * localization adjusted.
493 * PFS directory entries must be skipped entirely (return EALREADY).
495 static
497 hammer_mirror_localize_data(hammer_data_ondisk_t data,
498 hammer_btree_leaf_elm_t leaf)
500 u_int32_t localization;
502 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
503 if (data->entry.obj_id == HAMMER_OBJID_ROOT)
504 return(EALREADY);
505 localization = leaf->base.localization &
506 HAMMER_LOCALIZE_PSEUDOFS_MASK;
507 if (data->entry.localization != localization) {
508 data->entry.localization = localization;
509 hammer_crc_set_leaf(data, leaf);
512 return(0);
/*
 * Set mirroring/pseudo-fs information.
 *
 * Validates the caller-supplied pseudoid, structure size, and ioctl
 * version against this inode, then (root inode only) copies the new
 * pseudofs data in from userland and persists it.
 *
 * Returns 0, EINVAL, or a copyin errno.
 */
519 hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
520 struct hammer_ioc_pseudofs_rw *pfs)
522 hammer_pseudofs_inmem_t pfsm;
523 int error;
525 pfsm = ip->pfsm;
526 error = 0;
/*
 * Accumulate EINVAL across all validation checks rather than
 * returning early, so there is a single exit path.
 */
528 if (pfs->pseudoid != ip->obj_localization)
529 error = EINVAL;
530 if (pfs->bytes != sizeof(pfsm->pfsd))
531 error = EINVAL;
532 if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION)
533 error = EINVAL;
/*
 * Only the PFS root inode may have its pseudofs data replaced.
 */
534 if (error == 0 && pfs->ondisk) {
535 if (ip->obj_id != HAMMER_OBJID_ROOT)
536 error = EINVAL;
537 if (error == 0) {
538 error = copyin(pfs->ondisk, &ip->pfsm->pfsd,
539 sizeof(ip->pfsm->pfsd));
/*
 * NOTE(review): closing-brace placement around the save call is
 * ambiguous in this extract -- presumably hammer_save_pseudofs()
 * runs whenever the preceding steps succeeded; confirm against the
 * repository copy of this file.
 */
541 if (error == 0)
542 error = hammer_save_pseudofs(trans, ip);
544 return(error);
548 * Get mirroring/pseudo-fs information
551 hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
552 struct hammer_ioc_pseudofs_rw *pfs)
554 hammer_pseudofs_inmem_t pfsm;
555 int error;
557 pfs->pseudoid = ip->obj_localization;
558 pfs->bytes = sizeof(struct hammer_pseudofs_data);
559 pfs->version = HAMMER_IOC_PSEUDOFS_VERSION;
562 * Update pfsm->sync_end_tid if a master
564 pfsm = ip->pfsm;
565 if (pfsm->pfsd.master_id >= 0)
566 pfsm->pfsd.sync_end_tid = trans->rootvol->ondisk->vol0_next_tid;
569 * Return PFS information for root inodes only.
571 error = 0;
572 if (pfs->ondisk) {
573 if (ip->obj_id != HAMMER_OBJID_ROOT)
574 error = EINVAL;
575 if (error == 0) {
576 error = copyout(&ip->pfsm->pfsd, pfs->ondisk,
577 sizeof(ip->pfsm->pfsd));
580 return(error);