2 * Copyright (c) 2008 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_mirror.c,v 1.9 2008/07/09 10:29:20 dillon Exp $
37 * HAMMER mirroring ioctls - serialize and deserialize modifications made
/*
 * Forward declarations for the file-local helpers used by the mirroring
 * ioctls below: hammer_mirror_check(), hammer_mirror_update(),
 * hammer_mirror_write() and hammer_mirror_localize_data().  All four are
 * declared static and return int.
 *
 * NOTE(review): the leading decimal numbers and the one-token-per-line
 * layout look like residue of the listing this text was extracted from;
 * they are left untouched here.
 */
43 static int hammer_mirror_check(hammer_cursor_t cursor
,
44 struct hammer_ioc_mrecord
*mrec
);
45 static int hammer_mirror_update(hammer_cursor_t cursor
,
46 struct hammer_ioc_mrecord
*mrec
);
47 static int hammer_mirror_write(hammer_cursor_t cursor
,
48 struct hammer_ioc_mrecord
*mrec
,
49 hammer_inode_t ip
, char *udata
);
50 static int hammer_mirror_localize_data(hammer_data_ondisk_t data
,
51 hammer_btree_leaf_elm_t leaf
);
54 * All B-Tree records within the specified key range which also conform
55 * to the transaction id range are returned. Mirroring code keeps track
56 * of the last transaction id fully scanned and can efficiently pick up
57 * where it left off if interrupted.
59 * The PFS is identified in the mirror structure. The passed ip is just
60 * some directory in the overall HAMMER filesystem and has nothing to
61 * do with the PFS.
/*
 * hammer_ioc_mirror_read() - mirror-read ioctl backend.
 *
 * Scans the B-Tree with a cursor running from mirror->key_cur to
 * mirror->key_end (both offset by the localization derived from
 * mirror->pfs_id).  For elements returned by hammer_btree_first() /
 * hammer_btree_iterate() it builds a struct hammer_ioc_mrecord header,
 * copies the header and then the record data out to mirror->ubuf, and
 * advances mirror->count by the aligned record size.
 *
 * trans  - transaction; used for the cursor, the signal check, the
 *          sleeps and the flusher kick.
 * ip     - not referenced in the lines visible here.
 * mirror - request/response structure (key range, TID range, user
 *          buffer pointer/size/count, header flags).
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip),
 * so some declarations, branch bodies, labels and return statements are
 * not visible.  The comments below describe only what is shown.
 */
64 hammer_ioc_mirror_read(hammer_transaction_t trans
, hammer_inode_t ip
,
65 struct hammer_ioc_mirror_rw
*mirror
)
67 struct hammer_cursor cursor
;
68 struct hammer_ioc_mrecord mrec
;
69 hammer_btree_leaf_elm_t elm
;
70 const int head_size
= HAMMER_MREC_HEADSIZE
;
71 const int crc_start
= HAMMER_MREC_CRCOFF
;
76 u_int32_t localization
;
/* The PFS id is shifted into the upper 16 bits of the localization. */
78 localization
= (u_int32_t
)mirror
->pfs_id
<< 16;
/*
 * Test whether either caller-supplied key already carries pseudofs
 * bits.  The branch taken when it does is not visible here.
 */
80 if ((mirror
->key_beg
.localization
| mirror
->key_end
.localization
) &
81 HAMMER_LOCALIZE_PSEUDOFS_MASK
) {
/* Test for key_beg sorting after key_end (branch body not visible). */
84 if (hammer_btree_cmp(&mirror
->key_beg
, &mirror
->key_end
) > 0)
/* Start the scan position at key_beg, moved into the selected PFS. */
87 mirror
->key_cur
= mirror
->key_beg
;
88 mirror
->key_cur
.localization
+= localization
;
89 bzero(&mrec
, sizeof(mrec
));
92 error
= hammer_init_cursor(trans
, &cursor
, NULL
, NULL
);
94 hammer_done_cursor(&cursor
);
/* Scan range: current position through key_end within the PFS. */
97 cursor
.key_beg
= mirror
->key_cur
;
98 cursor
.key_end
= mirror
->key_end
;
99 cursor
.key_end
.localization
+= localization
;
101 cursor
.flags
|= HAMMER_CURSOR_END_INCLUSIVE
;
102 cursor
.flags
|= HAMMER_CURSOR_BACKEND
;
105 * This flag filters the search to only return elements whos create
106 * or delete TID is >= mirror_tid. The B-Tree uses the mirror_tid
107 * field stored with internal and leaf nodes to shortcut the scan.
109 cursor
.flags
|= HAMMER_CURSOR_MIRROR_FILTERED
;
110 cursor
.mirror_tid
= mirror
->tid_beg
;
112 error
= hammer_btree_first(&cursor
);
115 * Leaf node. Only return elements modified in the range
116 * requested by userland.
118 KKASSERT(cursor
.node
->ondisk
->type
== HAMMER_BTREE_TYPE_LEAF
);
119 elm
= &cursor
.node
->ondisk
->elms
[cursor
.index
].leaf
;
/*
 * Nested test: create_tid lies outside [tid_beg, tid_end) and
 * delete_tid lies outside [tid_beg, tid_end).  The action taken in
 * that case is not visible here.
 */
121 if (elm
->base
.create_tid
< mirror
->tid_beg
||
122 elm
->base
.create_tid
>= mirror
->tid_end
) {
123 if (elm
->base
.delete_tid
< mirror
->tid_beg
||
124 elm
->base
.delete_tid
>= mirror
->tid_end
) {
/* Record how far the scan has progressed. */
129 mirror
->key_cur
= elm
->base
;
132 * Yield to more important tasks
134 if ((error
= hammer_signal_check(trans
->hmp
)) != 0)
/* Sleep hz / 10 ticks when sync_lock.wanted is set. */
136 if (trans
->hmp
->sync_lock
.wanted
) {
137 tsleep(trans
, 0, "hmrslo", hz
/ 10);
/*
 * When locked dirty space plus running I/O space exceeds
 * hammer_limit_dirtybufspace, kick the flusher and sleep hz / 10 ticks.
 */
139 if (trans
->hmp
->locked_dirty_space
+
140 trans
->hmp
->io_running_space
> hammer_limit_dirtybufspace
) {
141 hammer_flusher_async(trans
->hmp
);
142 tsleep(trans
, 0, "hmrslo", hz
/ 10);
146 * The core code exports the data to userland.
/* An element with a zero data_offset exports no data bytes. */
148 data_len
= (elm
->data_offset
) ? elm
->data_len
: 0;
150 error
= hammer_btree_extract(&cursor
,
151 HAMMER_CURSOR_GET_DATA
);
/*
 * Record size is the header structure plus the data, rounded up using
 * HAMMER_HEAD_ALIGN_MASK.
 */
155 bytes
= sizeof(struct hammer_ioc_mrecord
) + data_len
;
156 bytes
= (bytes
+ HAMMER_HEAD_ALIGN_MASK
) &
157 ~HAMMER_HEAD_ALIGN_MASK
;
/*
 * Compare against the user buffer size.  The branch body is not
 * visible; presumably the scan stops when the record does not fit.
 */
158 if (mirror
->count
+ bytes
> mirror
->size
)
162 * Construct the record for userland and copyout.
164 * The user is asking for a snapshot, if the record was
165 * deleted beyond the user-requested ending tid, the record
166 * is not considered deleted from the point of view of
167 * userland and delete_tid is cleared.
169 mrec
.signature
= HAMMER_IOC_MIRROR_SIGNATURE
;
170 mrec
.type
= HAMMER_MREC_TYPE_REC
;
171 mrec
.rec_size
= bytes
;
173 if (elm
->base
.delete_tid
>= mirror
->tid_end
)
174 mrec
.leaf
.base
.delete_tid
= 0;
/* The CRC covers head_size - crc_start bytes starting at rec_size. */
175 mrec
.rec_crc
= crc32(&mrec
.rec_size
, head_size
- crc_start
);
/* Copy the header, then the data, to the current buffer position. */
176 uptr
= (char *)mirror
->ubuf
+ mirror
->count
;
177 error
= copyout(&mrec
, uptr
, head_size
);
178 if (data_len
&& error
== 0) {
179 error
= copyout(cursor
.data
, uptr
+ head_size
,
183 mirror
->count
+= bytes
;
/*
 * NOTE(review): presumably ATEDISK tells the iterator that the current
 * element has been consumed - confirm against the cursor code.
 */
186 cursor
.flags
|= HAMMER_CURSOR_ATEDISK
;
187 error
= hammer_btree_iterate(&cursor
);
/* ENOENT from the scan: position key_cur at key_end. */
190 if (error
== ENOENT
) {
191 mirror
->key_cur
= mirror
->key_end
;
194 hammer_done_cursor(&cursor
);
195 if (error
== EDEADLK
)
/* EINTR is reported to userland through the header flags. */
197 if (error
== EINTR
) {
198 mirror
->head
.flags
|= HAMMER_IOC_HEAD_INTR
;
/*
 * Keep only the HAMMER_LOCALIZE_MASK bits of key_cur.localization
 * (presumably strips the PFS bits added above - confirm).
 */
202 mirror
->key_cur
.localization
&= HAMMER_LOCALIZE_MASK
;
207 * Copy records from userland to the target mirror. Records which already
208 * exist may only have their delete_tid updated.
210 * The PFS is identified in the mirror structure. The passed ip is just
211 * some directory in the overall HAMMER filesystem and has nothing to
212 * do with the PFS. In fact, there might not even be a root directory for
213 * the PFS.
/*
 * hammer_ioc_mirror_write() - mirror-write ioctl backend.
 *
 * Consumes records from the user buffer mirror->ubuf.  For each record
 * the header is copied in and tested (signature, type, CRC, rec_size
 * bounds, data_len bounds), the leaf key is re-localized to the target
 * PFS, and a B-Tree lookup selects one of:
 *   - hammer_mirror_update() when the record is found and
 *     hammer_mirror_check() returns non-zero;
 *   - hammer_mirror_write() when the lookup returns ENOENT and the
 *     incoming delete_tid is 0.
 * Both calls are made between hammer_sync_lock_sh() and
 * hammer_sync_unlock().  mirror->count is advanced by mrec.rec_size.
 *
 * trans  - transaction used for the cursor and the sync lock.
 * ip     - passed through to hammer_mirror_write().
 * mirror - request structure (pfs_id, user buffer pointer/size/count).
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip),
 * so the bodies of the validation branches and the return statements
 * are not visible.  The comments below describe only what is shown.
 */
216 hammer_ioc_mirror_write(hammer_transaction_t trans
, hammer_inode_t ip
,
217 struct hammer_ioc_mirror_rw
*mirror
)
219 struct hammer_cursor cursor
;
220 struct hammer_ioc_mrecord mrec
;
221 const int head_size
= HAMMER_MREC_HEADSIZE
;
222 const int crc_start
= HAMMER_MREC_CRCOFF
;
226 u_int32_t localization
;
/* The PFS id is shifted into the upper 16 bits of the localization. */
228 localization
= (u_int32_t
)mirror
->pfs_id
<< 16;
/* Range test on the user buffer size (branch body not visible). */
230 if (mirror
->size
< 0 || mirror
->size
> 0x70000000)
233 error
= hammer_init_cursor(trans
, &cursor
, NULL
, NULL
);
235 hammer_normalize_cursor(&cursor
);
/* Loop while no error and at least one full header remains. */
237 while (error
== 0 && mirror
->count
+ head_size
<= mirror
->size
) {
239 * Acquire and validate header
241 uptr
= (char *)mirror
->ubuf
+ mirror
->count
;
242 error
= copyin(uptr
, &mrec
, head_size
);
/*
 * Recompute the CRC over head_size - crc_start bytes starting at
 * rec_size; it is compared with mrec.rec_crc below.
 */
245 rec_crc
= crc32(&mrec
.rec_size
, head_size
- crc_start
);
246 if (mrec
.signature
!= HAMMER_IOC_MIRROR_SIGNATURE
) {
250 if (mrec
.type
!= HAMMER_MREC_TYPE_REC
) {
254 if (rec_crc
!= mrec
.rec_crc
) {
/*
 * rec_size must hold at least a header, must not exceed
 * head_size + HAMMER_XBUFSIZE + 16, and must fit in what remains of
 * the user buffer.
 */
258 if (mrec
.rec_size
< head_size
||
259 mrec
.rec_size
> head_size
+ HAMMER_XBUFSIZE
+ 16 ||
260 mirror
->count
+ mrec
.rec_size
> mirror
->size
) {
/*
 * data_len must be within [0, HAMMER_XBUFSIZE] and, together with the
 * header structure, must fit inside rec_size.
 */
264 if (mrec
.leaf
.data_len
< 0 ||
265 mrec
.leaf
.data_len
> HAMMER_XBUFSIZE
||
266 sizeof(struct hammer_ioc_mrecord
) + mrec
.leaf
.data_len
> mrec
.rec_size
) {
271 * Re-localize for target. relocalization of data is handled
272 * by hammer_mirror_write().
274 mrec
.leaf
.base
.localization
&= HAMMER_LOCALIZE_MASK
;
275 mrec
.leaf
.base
.localization
+= localization
;
280 * If the record exists only the delete_tid may be updated.
282 * If the record does not exist we create it. For now we
283 * ignore records with a non-zero delete_tid. Note that
284 * mirror operations are effective an as-of operation and
285 * delete_tid can be 0 for mirroring purposes even if it is
286 * not actually 0 at the originator.
/* Plain lookup of the incoming key: BACKEND set, INSERT cleared. */
288 hammer_normalize_cursor(&cursor
);
289 cursor
.key_beg
= mrec
.leaf
.base
;
290 cursor
.flags
|= HAMMER_CURSOR_BACKEND
;
291 cursor
.flags
&= ~HAMMER_CURSOR_INSERT
;
292 error
= hammer_btree_lookup(&cursor
);
294 if (error
== 0 && hammer_mirror_check(&cursor
, &mrec
)) {
295 hammer_sync_lock_sh(trans
);
296 error
= hammer_mirror_update(&cursor
, &mrec
);
297 hammer_sync_unlock(trans
);
298 } else if (error
== ENOENT
&& mrec
.leaf
.base
.delete_tid
== 0) {
299 hammer_sync_lock_sh(trans
);
300 error
= hammer_mirror_write(&cursor
, &mrec
, ip
,
302 hammer_sync_unlock(trans
);
/* ENOENT with a non-zero incoming delete_tid (body not visible). */
303 } else if (error
== ENOENT
) {
308 * Clean for loop. It is ok if the record already exists
/* On EDEADLK the cursor is torn down and initialised again. */
311 if (error
== EDEADLK
) {
312 hammer_done_cursor(&cursor
);
313 error
= hammer_init_cursor(trans
, &cursor
, NULL
, NULL
);
317 if (error
== EALREADY
)
/* Step to the next record in the user buffer. */
320 mirror
->count
+= mrec
.rec_size
;
322 hammer_done_cursor(&cursor
);
327 * Check whether an update is needed in the case where a match already
328 * exists on the target. The only type of update allowed in this case
329 * is an update of the delete_tid.
331 * Return non-zero if the update should proceed.
/*
 * hammer_mirror_check() - compare an existing leaf with an incoming record.
 *
 * cursor - cursor positioned on the existing element; cursor->leaf is read.
 * mrec   - incoming mirror record.
 *
 * The visible logic tests whether the delete_tid of the existing leaf
 * differs from the incoming one and, nested inside that, whether the
 * incoming delete_tid is non-zero.
 *
 * NOTE(review): the return statements are not among the lines shown
 * here.  Per the comment preceding this function, a non-zero return
 * means the update should proceed.
 */
335 hammer_mirror_check(hammer_cursor_t cursor
, struct hammer_ioc_mrecord
*mrec
)
337 hammer_btree_leaf_elm_t leaf
= cursor
->leaf
;
339 if (leaf
->base
.delete_tid
!= mrec
->leaf
.base
.delete_tid
) {
340 if (mrec
->leaf
.base
.delete_tid
!= 0)
347 * Update a record in-place. Only the delete_tid can change.
/*
 * hammer_mirror_update() - update an existing B-Tree element in place.
 *
 * cursor - cursor positioned on the element; cursor->trans and
 *          cursor->node are used.
 * mrec   - incoming mirror record supplying delete_tid and delete_ts.
 *
 * When the incoming delete_tid is 0 a diagnostic is printed.  Otherwise
 * the element's delete_tid and delete_ts are overwritten between
 * hammer_modify_node() and hammer_modify_node_done(), and if the element
 * is an inode record the volume's vol0_stat_inodes count is decremented.
 *
 * Fix: the inode-count test now examines base.rec_type.  It previously
 * compared base.obj_type against HAMMER_RECTYPE_INODE, a record-type
 * constant; the other HAMMER_RECTYPE_* tests in this file use rec_type.
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip).
 * The statement that initialises elm and the return statements are not
 * among the lines shown; confirm elm is assigned (presumably from
 * cursor->leaf) before its first dereference.
 */
351 hammer_mirror_update(hammer_cursor_t cursor
, struct hammer_ioc_mrecord
*mrec
)
353 hammer_transaction_t trans
;
354 hammer_btree_leaf_elm_t elm
;
357 trans
= cursor
->trans
;
/* Source says "not deleted" while the target copy is: just report it. */
359 if (mrec
->leaf
.base
.delete_tid
== 0) {
360 kprintf("mirror_write: object %016llx:%016llx deleted on "
361 "target, not deleted on source\n",
362 elm
->base
.obj_id
, elm
->base
.key
);
/* A record cannot be deleted at or before the TID that created it. */
366 KKASSERT(elm
->base
.create_tid
< mrec
->leaf
.base
.delete_tid
);
367 hammer_modify_node(trans
, cursor
->node
, elm
, sizeof(*elm
));
368 elm
->base
.delete_tid
= mrec
->leaf
.base
.delete_tid
;
369 elm
->delete_ts
= mrec
->leaf
.delete_ts
;
370 hammer_modify_node_done(cursor
->node
);
373 * Track a count of active inodes.
/* rec_type, not obj_type, is what HAMMER_RECTYPE_* values describe. */
375 if (elm
->base
.rec_type
== HAMMER_RECTYPE_INODE
) {
376 hammer_modify_volume_field(trans
,
379 --trans
->hmp
->rootvol
->ondisk
->vol0_stat_inodes
;
380 hammer_modify_volume_done(trans
->rootvol
);
387 * Write out a new record.
/*
 * hammer_mirror_write() - insert a new record received from a mirror stream.
 *
 * cursor - B-Tree cursor; cursor->trans supplies the transaction.
 * mrec   - incoming mirror record; mrec->leaf is inserted and its
 *          data_offset is rewritten to the newly allocated location.
 * ip     - passed to hammer_btree_do_propagation().
 * udata  - user-space address of the record data, read with copyin().
 *
 * Visible steps: allocate data space and copy the payload in, verify the
 * leaf CRC, localize the payload, look the key up with
 * HAMMER_CURSOR_INSERT set, insert the leaf, bump vol0_stat_inodes for
 * live inode records, raise vol0_next_tid to the highest TID seen, run
 * propagation when requested, and on error free the data allocation.
 *
 * Fix: the inode-count test now examines base.rec_type.  It previously
 * compared base.obj_type against HAMMER_RECTYPE_INODE, although this
 * same function already tests rec_type against HAMMER_RECTYPE_INODE in
 * its root-object check.
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip),
 * so several declarations, branch bodies, labels and the return
 * statement are not visible.  Comments describe only what is shown.
 */
391 hammer_mirror_write(hammer_cursor_t cursor
, struct hammer_ioc_mrecord
*mrec
,
392 hammer_inode_t ip
, char *udata
)
394 hammer_transaction_t trans
;
395 hammer_buffer_t data_buffer
;
396 hammer_off_t ndata_offset
;
397 hammer_tid_t high_tid
;
404 * removed: all records are now duplicated, including the root
/* Root-object inode / FIX records are singled out (body not visible). */
407 if (mrec
->leaf
.base
.obj_id
== HAMMER_OBJID_ROOT
) {
408 if (mrec
->leaf
.base
.rec_type
== HAMMER_RECTYPE_INODE
||
409 mrec
->leaf
.base
.rec_type
== HAMMER_RECTYPE_FIX
) {
415 trans
= cursor
->trans
;
419 * Allocate and adjust data
/* Only records that carry data get a new data allocation. */
421 if (mrec
->leaf
.data_len
&& mrec
->leaf
.data_offset
) {
422 ndata
= hammer_alloc_data(trans
, mrec
->leaf
.data_len
,
423 mrec
->leaf
.base
.rec_type
,
424 &ndata_offset
, &data_buffer
, &error
);
/* The leaf now refers to the target-side data location. */
427 mrec
->leaf
.data_offset
= ndata_offset
;
428 hammer_modify_buffer(trans
, data_buffer
, NULL
, 0);
429 error
= copyin(udata
, ndata
, mrec
->leaf
.data_len
);
431 if (hammer_crc_test_leaf(ndata
, &mrec
->leaf
) == 0) {
432 kprintf("data crc mismatch on pipe\n");
435 error
= hammer_mirror_localize_data(
439 hammer_modify_buffer_done(data_buffer
);
/* NOTE(review): presumably the no-data branch - confirm. */
441 mrec
->leaf
.data_offset
= 0;
/* Lookup with INSERT set; ENOENT is the result tested for below. */
451 cursor
->flags
|= HAMMER_CURSOR_INSERT
;
452 error
= hammer_btree_lookup(cursor
);
453 if (error
!= ENOENT
) {
460 error
= hammer_btree_insert(cursor
, &mrec
->leaf
, &doprop
);
463 * Track a count of active inodes.
/* rec_type, not obj_type, is what HAMMER_RECTYPE_* values describe. */
465 if (error
== 0 && mrec
->leaf
.base
.delete_tid
== 0 &&
466 mrec
->leaf
.base
.rec_type
== HAMMER_RECTYPE_INODE
) {
467 hammer_modify_volume_field(trans
,
470 ++trans
->hmp
->rootvol
->ondisk
->vol0_stat_inodes
;
471 hammer_modify_volume_done(trans
->rootvol
);
475 * vol0_next_tid must track the highest TID stored in the filesystem.
476 * We do not need to generate undo for this update.
/* high_tid is the larger of the record's create_tid and delete_tid. */
478 high_tid
= mrec
->leaf
.base
.create_tid
;
479 if (high_tid
< mrec
->leaf
.base
.delete_tid
)
480 high_tid
= mrec
->leaf
.base
.delete_tid
;
481 if (trans
->rootvol
->ondisk
->vol0_next_tid
< high_tid
) {
482 hammer_modify_volume(trans
, trans
->rootvol
, NULL
, 0);
483 trans
->rootvol
->ondisk
->vol0_next_tid
= high_tid
;
484 hammer_modify_volume_done(trans
->rootvol
);
487 if (error
== 0 && doprop
)
488 hammer_btree_do_propagation(cursor
, ip
, &mrec
->leaf
);
/* On failure give back the data space allocated above. */
494 if (error
&& mrec
->leaf
.data_offset
) {
495 hammer_blockmap_free(cursor
->trans
,
496 mrec
->leaf
.data_offset
,
497 mrec
->leaf
.data_len
);
500 hammer_rel_buffer(data_buffer
, 0);
505 * Localize the data payload. Directory entries may need their
506 * localization adjusted.
508 * PFS directory entries must be skipped entirely (return EALREADY).
/*
 * hammer_mirror_localize_data() - fix up the localization stored inside a
 * record's data payload.
 *
 * data - record payload; only data->entry is touched.
 * leaf - leaf element describing the record (rec_type and localization
 *        are read; it is also passed to hammer_crc_set_leaf()).
 *
 * Only HAMMER_RECTYPE_DIRENTRY records are examined.  For those, the
 * pseudofs bits of leaf->base.localization are written into
 * data->entry.localization when they differ, and the leaf CRC is
 * recomputed with hammer_crc_set_leaf().
 *
 * NOTE(review): the branch taken when the entry's obj_id equals
 * HAMMER_OBJID_ROOT and the return statements are not among the lines
 * shown here.  The comment preceding this function says such entries
 * are skipped with EALREADY.
 */
512 hammer_mirror_localize_data(hammer_data_ondisk_t data
,
513 hammer_btree_leaf_elm_t leaf
)
515 u_int32_t localization
;
517 if (leaf
->base
.rec_type
== HAMMER_RECTYPE_DIRENTRY
) {
518 if (data
->entry
.obj_id
== HAMMER_OBJID_ROOT
)
/* Keep only the pseudofs bits of the leaf's localization. */
520 localization
= leaf
->base
.localization
&
521 HAMMER_LOCALIZE_PSEUDOFS_MASK
;
/* The payload changed, so the leaf CRC is set again. */
522 if (data
->entry
.localization
!= localization
) {
523 data
->entry
.localization
= localization
;
524 hammer_crc_set_leaf(data
, leaf
);
531 * Auto-detect the pseudofs.
/*
 * hammer_mirror_autodetect() - fill in pfs->pfs_id when the caller passed -1.
 *
 * pfs - pseudofs request; pfs_id is rewritten only when it equals -1.
 * ip  - inode whose obj_localization supplies the id (its value shifted
 *       right by 16 bits).
 */
535 hammer_mirror_autodetect(struct hammer_ioc_pseudofs_rw
*pfs
, hammer_inode_t ip
)
537 if (pfs
->pfs_id
== -1)
538 pfs
->pfs_id
= (int)(ip
->obj_localization
>> 16);
542 * Get mirroring/pseudo-fs information
/*
 * hammer_ioc_get_pseudofs() - return pseudo-filesystem data to userland.
 *
 * trans - transaction; supplies hmp and rootvol.
 * ip    - inode used by hammer_mirror_autodetect() when pfs_id is -1.
 * pfs   - request/response structure.  bytes and version are filled in,
 *         and the PFS data is copied out to pfs->ondisk when that
 *         pointer is non-NULL.
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip),
 * so the bodies of the validation branches and the return statements
 * are not visible.  The comments below describe only what is shown.
 */
545 hammer_ioc_get_pseudofs(hammer_transaction_t trans
, hammer_inode_t ip
,
546 struct hammer_ioc_pseudofs_rw
*pfs
)
548 hammer_pseudofs_inmem_t pfsm
;
549 u_int32_t localization
;
552 hammer_mirror_autodetect(pfs
, ip
);
/* pfs_id must lie in [0, HAMMER_MAX_PFS) (branch body not visible). */
553 if (pfs
->pfs_id
< 0 || pfs
->pfs_id
>= HAMMER_MAX_PFS
)
/* The PFS id is shifted into the upper 16 bits of the localization. */
555 localization
= (u_int32_t
)pfs
->pfs_id
<< 16;
556 pfs
->bytes
= sizeof(struct hammer_pseudofs_data
);
557 pfs
->version
= HAMMER_IOC_PSEUDOFS_VERSION
;
559 pfsm
= hammer_load_pseudofs(trans
, localization
, &error
);
/*
 * NOTE(review): the condition guarding this release is not visible;
 * presumably it is the load-failure path, since pfsm is used again
 * below - confirm.
 */
561 hammer_rel_pseudofs(trans
->hmp
, pfsm
);
566 * If the PFS is a master the sync tid is set by normal operation
567 * rather then the mirroring code, and will always track the
568 * real HAMMER filesystem.
570 if (pfsm
->pfsd
.master_id
>= 0)
571 pfsm
->pfsd
.sync_end_tid
= trans
->rootvol
->ondisk
->vol0_next_tid
;
574 * Copy out to userland.
/* A NULL pfs->ondisk means the caller wants no PFS data copied. */
577 if (pfs
->ondisk
&& error
== 0)
578 error
= copyout(&pfsm
->pfsd
, pfs
->ondisk
, sizeof(pfsm
->pfsd
));
579 hammer_rel_pseudofs(trans
->hmp
, pfsm
);
584 * Set mirroring/pseudo-fs information
/*
 * hammer_ioc_set_pseudofs() - install pseudo-filesystem data from userland.
 *
 * trans - transaction; supplies hmp.
 * ip    - inode used by hammer_mirror_autodetect() when pfs_id is -1.
 * cred  - credentials passed to hammer_mkroot_pseudofs().
 * pfs   - request structure.  pfs_id, bytes and version are tested, and
 *         pfs->ondisk is the user-space source of the new PFS data.
 *
 * When error is 0 and pfs->ondisk is non-NULL, the in-memory PFS is
 * loaded, overwritten from userland with copyin(), handed to
 * hammer_mkroot_pseudofs() / hammer_save_pseudofs(), and released.
 *
 * NOTE(review): this listing has gaps (the embedded line numbers skip).
 * The bodies of the validation branches, the conditions guarding the
 * mkroot and save calls, and the return statement are not visible.
 */
587 hammer_ioc_set_pseudofs(hammer_transaction_t trans
, hammer_inode_t ip
,
588 struct ucred
*cred
, struct hammer_ioc_pseudofs_rw
*pfs
)
590 hammer_pseudofs_inmem_t pfsm
;
592 u_int32_t localization
;
595 hammer_mirror_autodetect(pfs
, ip
);
/* pfs_id must lie in [0, HAMMER_MAX_PFS) (branch body not visible). */
596 if (pfs
->pfs_id
< 0 || pfs
->pfs_id
>= HAMMER_MAX_PFS
)
/* The caller's structure size and version must match this code's. */
598 if (pfs
->bytes
!= sizeof(pfsm
->pfsd
))
600 if (pfs
->version
!= HAMMER_IOC_PSEUDOFS_VERSION
)
602 if (error
== 0 && pfs
->ondisk
) {
604 * Load the PFS so we can modify our in-core copy.
606 localization
= (u_int32_t
)pfs
->pfs_id
<< 16;
/*
 * NOTE(review): the error stored by hammer_load_pseudofs() is
 * overwritten by the copyin() result in the next visible statement -
 * confirm that ignoring the load error is intended.
 */
607 pfsm
= hammer_load_pseudofs(trans
, localization
, &error
);
608 error
= copyin(pfs
->ondisk
, &pfsm
->pfsd
, sizeof(pfsm
->pfsd
));
611 * Save it back, create a root inode if we are in master
612 * mode and no root exists.
615 error
= hammer_mkroot_pseudofs(trans
, cred
, pfsm
);
617 error
= hammer_save_pseudofs(trans
, pfsm
);
618 hammer_rel_pseudofs(trans
->hmp
, pfsm
);