HAMMER 60I/Many: Mirroring
dragonfly.git: sys/vfs/hammer/hammer_mirror.c
blob 6c08d3b82b45534ad3e03df8950e6d39c701e767
/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.9 2008/07/09 10:29:20 dillon Exp $
 */
/*
 * HAMMER mirroring ioctls - serialize and deserialize modifications made
 * to a filesystem.
 */

#include "hammer.h"

static int hammer_mirror_check(hammer_cursor_t cursor,
                struct hammer_ioc_mrecord *mrec);
static int hammer_mirror_update(hammer_cursor_t cursor,
                struct hammer_ioc_mrecord *mrec);
static int hammer_mirror_write(hammer_cursor_t cursor,
                struct hammer_ioc_mrecord *mrec,
                hammer_inode_t ip, char *udata);
static int hammer_mirror_localize_data(hammer_data_ondisk_t data,
                hammer_btree_leaf_elm_t leaf);

/*
 * All B-Tree records within the specified key range which also conform
 * to the transaction id range are returned.  Mirroring code keeps track
 * of the last transaction id fully scanned and can efficiently pick up
 * where it left off if interrupted.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.
 */
int
hammer_ioc_mirror_read(hammer_transaction_t trans, hammer_inode_t ip,
                       struct hammer_ioc_mirror_rw *mirror)
{
        struct hammer_cursor cursor;
        struct hammer_ioc_mrecord mrec;
        hammer_btree_leaf_elm_t elm;
        const int head_size = HAMMER_MREC_HEADSIZE;
        const int crc_start = HAMMER_MREC_CRCOFF;
        char *uptr;
        int error;
        int data_len;
        int bytes;
        u_int32_t localization;

        localization = (u_int32_t)mirror->pfs_id << 16;

        if ((mirror->key_beg.localization | mirror->key_end.localization) &
            HAMMER_LOCALIZE_PSEUDOFS_MASK) {
                return(EINVAL);
        }
        if (hammer_btree_cmp(&mirror->key_beg, &mirror->key_end) > 0)
                return(EINVAL);

        mirror->key_cur = mirror->key_beg;
        mirror->key_cur.localization += localization;
        bzero(&mrec, sizeof(mrec));
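
        /*
         * On a deadlock (EDEADLK) the cursor is torn down and the scan is
         * restarted at key_cur, the base key of the last element processed.
         */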
retry:
        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
        if (error) {
                hammer_done_cursor(&cursor);
                goto failed;
        }
        cursor.key_beg = mirror->key_cur;
        cursor.key_end = mirror->key_end;
        cursor.key_end.localization += localization;

        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
        cursor.flags |= HAMMER_CURSOR_BACKEND;

        /*
         * This flag filters the search to only return elements whose create
         * or delete TID is >= mirror_tid.  The B-Tree uses the mirror_tid
         * field stored with internal and leaf nodes to shortcut the scan.
         */
        cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
        cursor.mirror_tid = mirror->tid_beg;

        error = hammer_btree_first(&cursor);
        while (error == 0) {
                /*
                 * Leaf node.  Only return elements modified in the range
                 * requested by userland.
                 */
                KKASSERT(cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
                elm = &cursor.node->ondisk->elms[cursor.index].leaf;
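
                /*
                 * An element qualifies if either its create_tid or its
                 * delete_tid falls within [tid_beg, tid_end).  Elements
                 * neither created nor deleted in the requested window
                 * are skipped.
                 */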
                if (elm->base.create_tid < mirror->tid_beg ||
                    elm->base.create_tid >= mirror->tid_end) {
                        if (elm->base.delete_tid < mirror->tid_beg ||
                            elm->base.delete_tid >= mirror->tid_end) {
                                goto skip;
                        }
                }

                mirror->key_cur = elm->base;

                /*
                 * Yield to more important tasks
                 */
                if ((error = hammer_signal_check(trans->hmp)) != 0)
                        break;
                if (trans->hmp->sync_lock.wanted) {
                        tsleep(trans, 0, "hmrslo", hz / 10);
                }
                if (trans->hmp->locked_dirty_space +
                    trans->hmp->io_running_space > hammer_limit_dirtybufspace) {
                        hammer_flusher_async(trans->hmp);
                        tsleep(trans, 0, "hmrslo", hz / 10);
                }

                /*
                 * The core code exports the data to userland.
                 */
                data_len = (elm->data_offset) ? elm->data_len : 0;
                if (data_len) {
                        error = hammer_btree_extract(&cursor,
                                                     HAMMER_CURSOR_GET_DATA);
                        if (error)
                                break;
                }
                bytes = sizeof(struct hammer_ioc_mrecord) + data_len;
                bytes = (bytes + HAMMER_HEAD_ALIGN_MASK) &
                        ~HAMMER_HEAD_ALIGN_MASK;
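
                /*
                 * Records are aligned to HAMMER_HEAD_ALIGN boundaries in
                 * the user buffer.  Stop when the next record would not
                 * fit in the space userland supplied.
                 */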
                if (mirror->count + bytes > mirror->size)
                        break;

                /*
                 * Construct the record for userland and copyout.
                 *
                 * The user is asking for a snapshot; if the record was
                 * deleted beyond the user-requested ending tid, the record
                 * is not considered deleted from the point of view of
                 * userland and delete_tid is cleared.
                 */
                mrec.signature = HAMMER_IOC_MIRROR_SIGNATURE;
                mrec.type = HAMMER_MREC_TYPE_REC;
                mrec.rec_size = bytes;
                mrec.leaf = *elm;
                if (elm->base.delete_tid >= mirror->tid_end)
                        mrec.leaf.base.delete_tid = 0;
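                /*
                 * The record CRC covers the header from rec_size onward
                 * (head_size - crc_start bytes); the write side recomputes
                 * it the same way before trusting the header.
                 */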
                mrec.rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
                uptr = (char *)mirror->ubuf + mirror->count;
                error = copyout(&mrec, uptr, head_size);
                if (data_len && error == 0) {
                        error = copyout(cursor.data, uptr + head_size,
                                        data_len);
                }
                if (error == 0)
                        mirror->count += bytes;
skip:
                if (error == 0) {
                        cursor.flags |= HAMMER_CURSOR_ATEDISK;
                        error = hammer_btree_iterate(&cursor);
                }
        }
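
        /*
         * ENOENT from the iteration simply means the scan reached the end
         * of the requested key range; advance key_cur to key_end so
         * userland sees a completed pass.
         */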
        if (error == ENOENT) {
                mirror->key_cur = mirror->key_end;
                error = 0;
        }
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        if (error == EINTR) {
                mirror->head.flags |= HAMMER_IOC_HEAD_INTR;
                error = 0;
        }
failed:
        mirror->key_cur.localization &= HAMMER_LOCALIZE_MASK;
        return(error);
}

/*
 * Copy records from userland to the target mirror.  Records which already
 * exist may only have their delete_tid updated.
 *
 * The PFS is identified in the mirror structure.  The passed ip is just
 * some directory in the overall HAMMER filesystem and has nothing to
 * do with the PFS.  In fact, there might not even be a root directory for
 * the PFS yet!
 */
int
hammer_ioc_mirror_write(hammer_transaction_t trans, hammer_inode_t ip,
                        struct hammer_ioc_mirror_rw *mirror)
{
        struct hammer_cursor cursor;
        struct hammer_ioc_mrecord mrec;
        const int head_size = HAMMER_MREC_HEADSIZE;
        const int crc_start = HAMMER_MREC_CRCOFF;
        u_int32_t rec_crc;
        int error;
        char *uptr;
        u_int32_t localization;

        localization = (u_int32_t)mirror->pfs_id << 16;

        if (mirror->size < 0 || mirror->size > 0x70000000)
                return(EINVAL);

        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
retry:
        hammer_normalize_cursor(&cursor);

        while (error == 0 && mirror->count + head_size <= mirror->size) {
                /*
                 * Acquire and validate header
                 */
                uptr = (char *)mirror->ubuf + mirror->count;
                error = copyin(uptr, &mrec, head_size);
                if (error)
                        break;
                rec_crc = crc32(&mrec.rec_size, head_size - crc_start);
                if (mrec.signature != HAMMER_IOC_MIRROR_SIGNATURE) {
                        error = EINVAL;
                        break;
                }
                if (mrec.type != HAMMER_MREC_TYPE_REC) {
                        error = EINVAL;
                        break;
                }
                if (rec_crc != mrec.rec_crc) {
                        error = EINVAL;
                        break;
                }
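                /*
                 * rec_size covers the header plus the aligned data payload.
                 * It must fall within our internal limits and must not run
                 * past the end of the user-supplied buffer.
                 */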
                if (mrec.rec_size < head_size ||
                    mrec.rec_size > head_size + HAMMER_XBUFSIZE + 16 ||
                    mirror->count + mrec.rec_size > mirror->size) {
                        error = EINVAL;
                        break;
                }
                if (mrec.leaf.data_len < 0 ||
                    mrec.leaf.data_len > HAMMER_XBUFSIZE ||
                    sizeof(struct hammer_ioc_mrecord) + mrec.leaf.data_len > mrec.rec_size) {
                        error = EINVAL;
                        break;
                }

                /*
                 * Re-localize for target.  Relocalization of the data is
                 * handled by hammer_mirror_write().
                 */
                mrec.leaf.base.localization &= HAMMER_LOCALIZE_MASK;
                mrec.leaf.base.localization += localization;

                /*
                 * Locate the record.
                 *
                 * If the record exists only the delete_tid may be updated.
                 *
                 * If the record does not exist we create it.  For now we
                 * ignore records with a non-zero delete_tid.  Note that
                 * mirror operations are effectively as-of operations and
                 * delete_tid can be 0 for mirroring purposes even if it is
                 * not actually 0 at the originator.
                 */
                hammer_normalize_cursor(&cursor);
                cursor.key_beg = mrec.leaf.base;
                cursor.flags |= HAMMER_CURSOR_BACKEND;
                cursor.flags &= ~HAMMER_CURSOR_INSERT;
                error = hammer_btree_lookup(&cursor);

                if (error == 0 && hammer_mirror_check(&cursor, &mrec)) {
                        hammer_sync_lock_sh(trans);
                        error = hammer_mirror_update(&cursor, &mrec);
                        hammer_sync_unlock(trans);
                } else if (error == ENOENT && mrec.leaf.base.delete_tid == 0) {
                        hammer_sync_lock_sh(trans);
                        error = hammer_mirror_write(&cursor, &mrec, ip,
                                                    uptr + head_size);
                        hammer_sync_unlock(trans);
                } else if (error == ENOENT) {
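                        /*
                         * The record is marked deleted on the source and
                         * does not exist on the target; nothing to do.
                         */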
                        error = 0;
                }

                /*
                 * Clean up for the next loop.  It is ok if the record
                 * already exists on the target.
                 */
                if (error == EDEADLK) {
                        hammer_done_cursor(&cursor);
                        error = hammer_init_cursor(trans, &cursor, NULL, NULL);
                        goto retry;
                }

                if (error == EALREADY)
                        error = 0;
                if (error == 0)
                        mirror->count += mrec.rec_size;
        }
        hammer_done_cursor(&cursor);
        return(0);
}

/*
 * Check whether an update is needed in the case where a match already
 * exists on the target.  The only type of update allowed in this case
 * is an update of the delete_tid.
 *
 * Return non-zero if the update should proceed.
 */
static
int
hammer_mirror_check(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
{
        hammer_btree_leaf_elm_t leaf = cursor->leaf;

        if (leaf->base.delete_tid != mrec->leaf.base.delete_tid) {
                if (mrec->leaf.base.delete_tid != 0)
                        return(1);
        }
        return(0);
}

/*
 * Update a record in-place.  Only the delete_tid can change.
 */
static
int
hammer_mirror_update(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec)
{
        hammer_transaction_t trans;
        hammer_btree_leaf_elm_t elm;

        elm = cursor->leaf;
        trans = cursor->trans;

        if (mrec->leaf.base.delete_tid == 0) {
                kprintf("mirror_write: object %016llx:%016llx deleted on "
                        "target, not deleted on source\n",
                        elm->base.obj_id, elm->base.key);
                return(0);
        }

        KKASSERT(elm->base.create_tid < mrec->leaf.base.delete_tid);
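        /*
         * hammer_modify_node() generates undo for the element and marks
         * the node's buffer modified before it is changed in place.
         */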
        hammer_modify_node(trans, cursor->node, elm, sizeof(*elm));
        elm->base.delete_tid = mrec->leaf.base.delete_tid;
        elm->delete_ts = mrec->leaf.delete_ts;
        hammer_modify_node_done(cursor->node);

        /*
         * Track a count of active inodes.
         */
        if (elm->base.rec_type == HAMMER_RECTYPE_INODE) {
                hammer_modify_volume_field(trans,
                                           trans->rootvol,
                                           vol0_stat_inodes);
                --trans->hmp->rootvol->ondisk->vol0_stat_inodes;
                hammer_modify_volume_done(trans->rootvol);
        }

        return(0);
}

/*
 * Write out a new record.
 */
static
int
hammer_mirror_write(hammer_cursor_t cursor, struct hammer_ioc_mrecord *mrec,
                    hammer_inode_t ip, char *udata)
{
        hammer_transaction_t trans;
        hammer_buffer_t data_buffer;
        hammer_off_t ndata_offset;
        hammer_tid_t high_tid;
        void *ndata;
        int error;
        int doprop;

#if 0
        /*
         * removed: all records are now duplicated, including the root
         * inode.
         */
        if (mrec->leaf.base.obj_id == HAMMER_OBJID_ROOT) {
                if (mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE ||
                    mrec->leaf.base.rec_type == HAMMER_RECTYPE_FIX) {
                        return(0);
                }
        }
#endif

        trans = cursor->trans;
        data_buffer = NULL;

        /*
         * Allocate and adjust data
         */
        if (mrec->leaf.data_len && mrec->leaf.data_offset) {
                ndata = hammer_alloc_data(trans, mrec->leaf.data_len,
                                          mrec->leaf.base.rec_type,
                                          &ndata_offset, &data_buffer, &error);
                if (ndata == NULL)
                        return(error);
                mrec->leaf.data_offset = ndata_offset;
                hammer_modify_buffer(trans, data_buffer, NULL, 0);
                error = copyin(udata, ndata, mrec->leaf.data_len);
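                /*
                 * The payload must match the CRC recorded in the leaf
                 * before it is trusted or adjusted.  If localization
                 * changes the data, hammer_mirror_localize_data()
                 * regenerates the CRC.
                 */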
                if (error == 0) {
                        if (hammer_crc_test_leaf(ndata, &mrec->leaf) == 0) {
                                kprintf("data crc mismatch on pipe\n");
                                error = EINVAL;
                        } else {
                                error = hammer_mirror_localize_data(
                                                ndata, &mrec->leaf);
                        }
                }
                hammer_modify_buffer_done(data_buffer);
        } else {
                mrec->leaf.data_offset = 0;
                error = 0;
                ndata = NULL;
        }
        if (error)
                goto failed;

        /*
         * Do the insertion
         */
        cursor->flags |= HAMMER_CURSOR_INSERT;
        error = hammer_btree_lookup(cursor);
        if (error != ENOENT) {
                if (error == 0)
                        error = EALREADY;
                goto failed;
        }
        error = 0;
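
        /*
         * Insert the leaf element.  doprop is set by hammer_btree_insert()
         * when a propagation pass up the B-Tree is required; that is
         * handled below, after the bookkeeping, via
         * hammer_btree_do_propagation().
         */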
        error = hammer_btree_insert(cursor, &mrec->leaf, &doprop);

        /*
         * Track a count of active inodes.
         */
        if (error == 0 && mrec->leaf.base.delete_tid == 0 &&
            mrec->leaf.base.rec_type == HAMMER_RECTYPE_INODE) {
                hammer_modify_volume_field(trans,
                                           trans->rootvol,
                                           vol0_stat_inodes);
                ++trans->hmp->rootvol->ondisk->vol0_stat_inodes;
                hammer_modify_volume_done(trans->rootvol);
        }

        /*
         * vol0_next_tid must track the highest TID stored in the filesystem.
         * We do not need to generate undo for this update.
         */
        high_tid = mrec->leaf.base.create_tid;
        if (high_tid < mrec->leaf.base.delete_tid)
                high_tid = mrec->leaf.base.delete_tid;
        if (trans->rootvol->ondisk->vol0_next_tid < high_tid) {
                hammer_modify_volume(trans, trans->rootvol, NULL, 0);
                trans->rootvol->ondisk->vol0_next_tid = high_tid;
                hammer_modify_volume_done(trans->rootvol);
        }

        if (error == 0 && doprop)
                hammer_btree_do_propagation(cursor, ip, &mrec->leaf);

failed:
        /*
         * Cleanup
         */
        if (error && mrec->leaf.data_offset) {
                hammer_blockmap_free(cursor->trans,
                                     mrec->leaf.data_offset,
                                     mrec->leaf.data_len);
        }
        if (data_buffer)
                hammer_rel_buffer(data_buffer, 0);
        return(error);
}

/*
 * Localize the data payload.  Directory entries may need their
 * localization adjusted.
 *
 * PFS directory entries must be skipped entirely (return EALREADY).
 */
static
int
hammer_mirror_localize_data(hammer_data_ondisk_t data,
                            hammer_btree_leaf_elm_t leaf)
{
        u_int32_t localization;

        if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
                if (data->entry.obj_id == HAMMER_OBJID_ROOT)
                        return(EALREADY);
                localization = leaf->base.localization &
                               HAMMER_LOCALIZE_PSEUDOFS_MASK;
                if (data->entry.localization != localization) {
                        data->entry.localization = localization;
                        hammer_crc_set_leaf(data, leaf);
                }
        }
        return(0);
}

/*
 * Auto-detect the pseudofs.
 */
static
void
hammer_mirror_autodetect(struct hammer_ioc_pseudofs_rw *pfs, hammer_inode_t ip)
{
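        /*
         * A pfs_id of -1 means "use the PFS the ioctl was issued on".
         * The PFS id is encoded in the upper 16 bits of the inode's
         * localization field.
         */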
        if (pfs->pfs_id == -1)
                pfs->pfs_id = (int)(ip->obj_localization >> 16);
}

/*
 * Get mirroring/pseudo-fs information
 */
int
hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
                        struct hammer_ioc_pseudofs_rw *pfs)
{
        hammer_pseudofs_inmem_t pfsm;
        u_int32_t localization;
        int error;

        hammer_mirror_autodetect(pfs, ip);
        if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS)
                return(EINVAL);
        localization = (u_int32_t)pfs->pfs_id << 16;
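
        /*
         * Report the size and layout version of the PFS structure back
         * to userland so the caller can validate its own copy.
         */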
        pfs->bytes = sizeof(struct hammer_pseudofs_data);
        pfs->version = HAMMER_IOC_PSEUDOFS_VERSION;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        if (error) {
                hammer_rel_pseudofs(trans->hmp, pfsm);
                return(error);
        }

        /*
         * If the PFS is a master the sync tid is set by normal operation
         * rather than by the mirroring code, and will always track the
         * real HAMMER filesystem.
         */
        if (pfsm->pfsd.master_id >= 0)
                pfsm->pfsd.sync_end_tid = trans->rootvol->ondisk->vol0_next_tid;

        /*
         * Copy out to userland.
         */
        error = 0;
        if (pfs->ondisk && error == 0)
                error = copyout(&pfsm->pfsd, pfs->ondisk, sizeof(pfsm->pfsd));
        hammer_rel_pseudofs(trans->hmp, pfsm);
        return(error);
}

/*
 * Set mirroring/pseudo-fs information
 */
int
hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
                        struct ucred *cred, struct hammer_ioc_pseudofs_rw *pfs)
{
        hammer_pseudofs_inmem_t pfsm;
        int error;
        u_int32_t localization;

        error = 0;
        hammer_mirror_autodetect(pfs, ip);
        if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS)
                error = EINVAL;
        if (pfs->bytes != sizeof(pfsm->pfsd))
                error = EINVAL;
        if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION)
                error = EINVAL;
        if (error == 0 && pfs->ondisk) {
                /*
                 * Load the PFS so we can modify our in-core copy.
                 */
                localization = (u_int32_t)pfs->pfs_id << 16;
                pfsm = hammer_load_pseudofs(trans, localization, &error);
                error = copyin(pfs->ondisk, &pfsm->pfsd, sizeof(pfsm->pfsd));

                /*
                 * Save it back, create a root inode if we are in master
                 * mode and no root exists.
                 */
                if (error == 0)
                        error = hammer_mkroot_pseudofs(trans, cred, pfsm);
                if (error == 0)
                        error = hammer_save_pseudofs(trans, pfsm);
                hammer_rel_pseudofs(trans->hmp, pfsm);
        }
        return(error);
}