/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
 */
#include "hammer.h"
#include <vm/vm_extern.h>
static int hammer_unload_inode(struct hammer_inode *ip);
static void hammer_free_inode(hammer_inode_t ip);
static void hammer_flush_inode_core(hammer_inode_t ip,
                        hammer_flush_group_t flg, int flags);
static int hammer_setup_child_callback(hammer_record_t rec, void *data);
static int hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
static int hammer_setup_parent_inodes(hammer_inode_t ip,
                        hammer_flush_group_t flg);
static int hammer_setup_parent_inodes_helper(hammer_record_t record,
                        hammer_flush_group_t flg);
static void hammer_inode_wakereclaims(hammer_inode_t ip, int dowake);

#ifdef DEBUG_TRUNCATE
extern struct hammer_inode *HammerTruncIp;
#endif
/*
 * RB-Tree support for inode structures
 */
int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
        if (ip1->obj_localization < ip2->obj_localization)
                return(-1);
        if (ip1->obj_localization > ip2->obj_localization)
                return(1);
        if (ip1->obj_id < ip2->obj_id)
                return(-1);
        if (ip1->obj_id > ip2->obj_id)
                return(1);
        if (ip1->obj_asof < ip2->obj_asof)
                return(-1);
        if (ip1->obj_asof > ip2->obj_asof)
                return(1);
        return(0);
}
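
/*
 * Illustrative sketch (not part of the original source): the comparator
 * orders inodes by (obj_localization, obj_id, obj_asof), so all as-of
 * snapshots of the same object sort adjacently in the red-black tree.
 * Assuming two in-memory inodes with equal localization and asof:
 *
 *      struct hammer_inode a, b;
 *
 *      a.obj_localization = b.obj_localization = 0;
 *      a.obj_asof = b.obj_asof = HAMMER_MAX_TID;
 *      a.obj_id = 100;
 *      b.obj_id = 200;
 *      (then hammer_ino_rb_compare(&a, &b) returns -1; a sorts before b)
 */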
/*
 * RB-Tree support for inode structures / special LOOKUP_INFO
 */
static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
        if (info->obj_localization < ip->obj_localization)
                return(-1);
        if (info->obj_localization > ip->obj_localization)
                return(1);
        if (info->obj_id < ip->obj_id)
                return(-1);
        if (info->obj_id > ip->obj_id)
                return(1);
        if (info->obj_asof < ip->obj_asof)
                return(-1);
        if (info->obj_asof > ip->obj_asof)
                return(1);
        return(0);
}
/*
 * Used by hammer_scan_inode_snapshots() to locate all of an object's
 * snapshots.  Note that the asof field is not tested, which we can get
 * away with because it is the lowest-priority field.
 */
static int
hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
{
        hammer_inode_info_t info = data;

        if (ip->obj_localization > info->obj_localization)
                return(1);
        if (ip->obj_localization < info->obj_localization)
                return(-1);
        if (ip->obj_id > info->obj_id)
                return(1);
        if (ip->obj_id < info->obj_id)
                return(-1);
        return(0);
}
/*
 * Used by hammer_unload_pseudofs() to locate all inodes associated with
 * a particular PFS.
 */
static int
hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
{
        u_int32_t localization = *(u_int32_t *)data;

        if (ip->obj_localization > localization)
                return(1);
        if (ip->obj_localization < localization)
                return(-1);
        return(0);
}
/*
 * RB-Tree support for pseudofs structures
 */
static int
hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
{
        if (p1->localization < p2->localization)
                return(-1);
        if (p1->localization > p2->localization)
                return(1);
        return(0);
}
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
                hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
             hammer_pfs_rb_compare, u_int32_t, localization);
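
/*
 * Illustrative sketch (drawn from uses later in this file): the macros
 * above emit the lookup and scan functions used throughout, e.g. the
 * INFO extended lookup and the keyed PFS lookup:
 *
 *      ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
 *      pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root,
 *                       localization);
 */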
/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 *
 * This is called from the frontend.
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        /*
         * If the inode no longer has visibility in the filesystem try to
         * recycle it immediately, even if the inode is dirty.  Recycling
         * it quickly allows the system to reclaim buffer cache and VM
         * resources which can matter a lot in a heavily loaded system.
         *
         * This can deadlock in vfsync() if we aren't careful.
         *
         * Do not queue the inode to the flusher if we still have visibility,
         * otherwise namespace calls such as chmod will unnecessarily generate
         * multiple inode updates.
         */
        hammer_inode_unloadable_check(ip, 0);
        if (ip->ino_data.nlinks == 0) {
                if (ip->flags & HAMMER_INODE_MODMASK)
                        hammer_flush_inode(ip, 0);
                vrecycle(ap->a_vp);
        }
        return(0);
}
/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode.
 *
 * Once the association is lost we are on our own with regards to
 * flushing the inode.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct vnode *vp;

        vp = ap->a_vp;

        if ((ip = vp->v_data) != NULL) {
                hmp = ip->hmp;
                vp->v_data = NULL;
                ip->vp = NULL;

                if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
                        ++hammer_count_reclaiming;
                        ++hmp->inode_reclaims;
                        ip->flags |= HAMMER_INODE_RECLAIM;
                }
                hammer_rel_inode(ip, 1);
        }
        return(0);
}
/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
 */
int
hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
{
        hammer_mount_t hmp;
        struct vnode *vp;
        int error = 0;
        u_int8_t obj_type;

        hmp = ip->hmp;

        for (;;) {
                if ((vp = ip->vp) == NULL) {
                        error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
                        if (error)
                                break;
                        hammer_lock_ex(&ip->lock);
                        if (ip->vp != NULL) {
                                hammer_unlock(&ip->lock);
                                vp = *vpp;
                                vp->v_type = VBAD;
                                vx_put(vp);
                                continue;
                        }
                        hammer_ref(&ip->lock);
                        vp = *vpp;
                        ip->vp = vp;

                        obj_type = ip->ino_data.obj_type;
                        vp->v_type = hammer_get_vnode_type(obj_type);

                        hammer_inode_wakereclaims(ip, 0);

                        switch(ip->ino_data.obj_type) {
                        case HAMMER_OBJTYPE_CDEV:
                        case HAMMER_OBJTYPE_BDEV:
                                vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
                                addaliasu(vp, ip->ino_data.rmajor,
                                          ip->ino_data.rminor);
                                break;
                        case HAMMER_OBJTYPE_FIFO:
                                vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
                                break;
                        default:
                                break;
                        }

                        /*
                         * Only mark as the root vnode if the ip is not
                         * historical, otherwise the VFS cache will get
                         * confused.  The other half of the special handling
                         * is in hammer_vop_nlookupdotdot().
                         *
                         * Pseudo-filesystem roots can be accessed via
                         * non-root filesystem paths and setting VROOT may
                         * confuse the namecache.  Set VPFSROOT instead.
                         */
                        if (ip->obj_id == HAMMER_OBJID_ROOT &&
                            ip->obj_asof == hmp->asof) {
                                if (ip->obj_localization == 0)
                                        vp->v_flag |= VROOT;
                                else
                                        vp->v_flag |= VPFSROOT;
                        }

                        vp->v_data = (void *)ip;
                        /* vnode locked by getnewvnode() */
                        /* make related vnode dirty if inode dirty? */
                        hammer_unlock(&ip->lock);
                        if (vp->v_type == VREG)
                                vinitvmio(vp, ip->ino_data.size);
                        break;
                }

                /*
                 * loop if the vget fails (aka races), or if the vp
                 * no longer matches ip->vp.
                 */
                if (vget(vp, LK_EXCLUSIVE) == 0) {
                        if (vp == ip->vp)
                                break;
                        vput(vp);
                }
        }
        *vpp = vp;
        return(error);
}
/*
 * Locate all copies of the inode for obj_id compatible with the specified
 * asof, reference, and issue the related call-back.  This routine is used
 * for direct-io invalidation and does not create any new inodes.
 */
void
hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
                            int (*callback)(hammer_inode_t ip, void *data),
                            void *data)
{
        hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
                                   hammer_inode_info_cmp_all_history,
                                   callback, iinfo);
}
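
/*
 * Illustrative sketch (an assumption, not part of the original file):
 * a caller such as the direct-io invalidation path would fill in a
 * hammer_inode_info and supply a callback that is invoked once for each
 * cached snapshot of the object; my_callback here is hypothetical:
 *
 *      struct hammer_inode_info iinfo;
 *
 *      iinfo.obj_id = obj_id;
 *      iinfo.obj_asof = 0;             (asof is not tested by the cmp)
 *      iinfo.obj_localization = localization;
 *      hammer_scan_inode_snapshots(hmp, &iinfo, my_callback, NULL);
 */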
/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
                 int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
                 int flags, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        struct hammer_inode_info iinfo;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;

        /*
         * Determine if we already have an inode cached.  If we do then
         * we are golden.
         *
         * If we find an inode with no vnode we have to mark the
         * transaction such that hammer_inode_waitreclaims() is
         * called later on to avoid building up an infinite number
         * of inodes.  Otherwise we can continue to add new inodes
         * faster than they can be disposed of, even with the tsleep
         * delay.
         */
        iinfo.obj_id = obj_id;
        iinfo.obj_asof = asof;
        iinfo.obj_localization = localization;
loop:
        ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
        if (ip) {
                if (ip->vp == NULL)
                        trans->flags |= HAMMER_TRANSF_NEWINODE;
                *errorp = 0;
                hammer_ref(&ip->lock);
                return(ip);
        }

        /*
         * Allocate a new inode structure and deal with races later.
         */
        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
        ip->obj_id = obj_id;
        ip->obj_asof = iinfo.obj_asof;
        ip->obj_localization = localization;
        ip->hmp = hmp;
        ip->flags = flags & HAMMER_INODE_RO;
        ip->cache[0].ip = ip;
        ip->cache[1].ip = ip;
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
                0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);
        hammer_ref(&ip->lock);

        /*
         * Locate the on-disk inode.  If this is a PFS root we always
         * access the current version of the root inode and (if it is not
         * a master) always access information under it with a snapshot
         * TID.
         */
retry:
        hammer_init_cursor(trans, &cursor, (dip ? &dip->cache[0] : NULL), NULL);
        cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = 0;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor.key_beg.obj_type = 0;

        cursor.asof = iinfo.obj_asof;
        cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
                       HAMMER_CURSOR_ASOF;

        *errorp = hammer_btree_lookup(&cursor);
        if (*errorp == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * On success the B-Tree lookup will hold the appropriate
         * buffer cache buffers and provide a pointer to the requested
         * information.  Copy the information to the in-memory inode
         * and cache the B-Tree node to improve future operations.
         */
        if (*errorp == 0) {
                ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
                ip->ino_data = cursor.data->inode;

                /*
                 * cache[0] tries to cache the location of the object inode.
                 * The assumption is that it is near the directory inode.
                 *
                 * cache[1] tries to cache the location of the object data.
                 * The assumption is that it is near the directory data.
                 */
                hammer_cache_node(&ip->cache[0], cursor.node);
                if (dip && dip->cache[1].node)
                        hammer_cache_node(&ip->cache[1], dip->cache[1].node);

                /*
                 * The file should not contain any data past the file size
                 * stored in the inode.  Setting save_trunc_off to the
                 * file size instead of max reduces B-Tree lookup overheads
                 * on append by allowing the flusher to avoid checking for
                 * record overwrites.
                 */
                ip->save_trunc_off = ip->ino_data.size;

                /*
                 * Locate and assign the pseudofs management structure to
                 * the inode.
                 */
                if (dip && dip->obj_localization == ip->obj_localization) {
                        ip->pfsm = dip->pfsm;
                        hammer_ref(&ip->pfsm->lock);
                } else {
                        ip->pfsm = hammer_load_pseudofs(trans,
                                                        ip->obj_localization,
                                                        errorp);
                        *errorp = 0;    /* ignore ENOENT */
                }
        }

        /*
         * The inode is placed on the red-black tree and will be synced to
         * the media when flushed or by the filesystem sync.  If this races
         * another instantiation/lookup the insertion will fail.
         */
        if (*errorp == 0) {
                if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                        hammer_free_inode(ip);
                        hammer_done_cursor(&cursor);
                        goto loop;
                }
                ip->flags |= HAMMER_INODE_ONDISK;
        } else {
                if (ip->flags & HAMMER_INODE_RSV_INODES) {
                        ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
                        --hmp->rsv_inodes;
                }

                hammer_free_inode(ip);
                ip = NULL;
        }
        hammer_done_cursor(&cursor);
        trans->flags |= HAMMER_TRANSF_NEWINODE;
        return(ip);
}
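
/*
 * Illustrative sketch (an assumption, not part of the original file): a
 * typical frontend lookup obtains a referenced inode and then attaches a
 * vnode, dropping its own reference once the vnode holds one:
 *
 *      hammer_inode_t ip;
 *      int error;
 *
 *      ip = hammer_get_inode(&trans, dip, obj_id, asof, localization,
 *                            flags, &error);
 *      if (ip) {
 *              error = hammer_get_vnode(ip, &vp);
 *              hammer_rel_inode(ip, 0);
 *      }
 */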
/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced.  The inode is created in-memory.
 *
 * If pfsm is non-NULL the caller wishes to create the root inode for
 * a master PFS.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
                    struct ucred *cred, hammer_inode_t dip,
                    hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
{
        hammer_mount_t hmp;
        hammer_inode_t ip;
        uid_t xuid;
        int error;

        hmp = trans->hmp;

        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
        trans->flags |= HAMMER_TRANSF_NEWINODE;

        if (pfsm) {
                KKASSERT(pfsm->localization != 0);
                ip->obj_id = HAMMER_OBJID_ROOT;
                ip->obj_localization = pfsm->localization;
        } else {
                KKASSERT(dip != NULL);
                ip->obj_id = hammer_alloc_objid(hmp, dip);
                ip->obj_localization = dip->obj_localization;
        }

        KKASSERT(ip->obj_id != 0);
        ip->obj_asof = hmp->asof;
        ip->hmp = hmp;
        ip->flush_state = HAMMER_FST_IDLE;
        ip->flags = HAMMER_INODE_DDIRTY |
                    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
        ip->cache[0].ip = ip;
        ip->cache[1].ip = ip;

        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        /* ip->save_trunc_off = 0; (already zero) */
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);

        ip->ino_data.atime = trans->time;
        ip->ino_data.mtime = trans->time;
        ip->ino_data.size = 0;
        ip->ino_data.nlinks = 0;

        /*
         * A nohistory designator on the parent directory is inherited by
         * the child.  We will do this even for pseudo-fs creation... the
         * sysad can turn it off.
         */
        if (dip) {
                ip->ino_data.uflags = dip->ino_data.uflags &
                                      (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
        }

        ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
        ip->ino_leaf.base.localization = ip->obj_localization +
                                         HAMMER_LOCALIZE_INODE;
        ip->ino_leaf.base.obj_id = ip->obj_id;
        ip->ino_leaf.base.key = 0;
        ip->ino_leaf.base.create_tid = 0;
        ip->ino_leaf.base.delete_tid = 0;
        ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
        ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);

        ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
        ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
        ip->ino_data.mode = vap->va_mode;
        ip->ino_data.ctime = trans->time;

        /*
         * If we are running version 2 or greater we use dirhash algorithm #1
         * which is semi-sorted.  Algorithm #0 was just a pure crc.
         */
        if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
                if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
                        ip->ino_data.cap_flags |= HAMMER_INODE_CAP_DIRHASH_ALG1;
                }
        }

        /*
         * Setup the ".." pointer.  This only needs to be done for directories
         * but we do it for all objects as a recovery aid.
         */
        if (dip)
                ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;

        /*
         * The parent_obj_localization field only applies to pseudo-fs roots.
         * XXX this is no longer applicable, PFSs are no longer directly
         * tied into the parent's directory structure.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
            ip->obj_id == HAMMER_OBJID_ROOT) {
                ip->ino_data.ext.obj.parent_obj_localization =
                                                dip->obj_localization;
        }

        switch(ip->ino_leaf.base.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                ip->ino_data.rmajor = vap->va_rmajor;
                ip->ino_data.rminor = vap->va_rminor;
                break;
        default:
                break;
        }

        /*
         * Calculate default uid/gid and overwrite with information from
         * the vap.
         */
        if (dip) {
                xuid = hammer_to_unix_xid(&dip->ino_data.uid);
                xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
                                             xuid, cred, &vap->va_mode);
        } else {
                xuid = 0;
        }
        ip->ino_data.mode = vap->va_mode;

        if (vap->va_vaflags & VA_UID_UUID_VALID)
                ip->ino_data.uid = vap->va_uid_uuid;
        else if (vap->va_uid != (uid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
        else
                hammer_guid_to_uuid(&ip->ino_data.uid, xuid);

        if (vap->va_vaflags & VA_GID_UUID_VALID)
                ip->ino_data.gid = vap->va_gid_uuid;
        else if (vap->va_gid != (gid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
        else
                ip->ino_data.gid = dip->ino_data.gid;

        hammer_ref(&ip->lock);

        if (pfsm) {
                ip->pfsm = pfsm;
                hammer_ref(&pfsm->lock);
                error = 0;
        } else if (dip->obj_localization == ip->obj_localization) {
                ip->pfsm = dip->pfsm;
                hammer_ref(&ip->pfsm->lock);
                error = 0;
        } else {
                ip->pfsm = hammer_load_pseudofs(trans,
                                                ip->obj_localization,
                                                &error);
                error = 0;      /* ignore ENOENT */
        }

        if (error) {
                hammer_free_inode(ip);
                ip = NULL;
        } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
                /* not reached */
                hammer_free_inode(ip);
        }
        *ipp = ip;
        return(error);
}
/*
 * Final cleanup / freeing of an inode structure
 */
static void
hammer_free_inode(hammer_inode_t ip)
{
        struct hammer_mount *hmp;

        hmp = ip->hmp;
        KKASSERT(ip->lock.refs == 1);
        hammer_uncache_node(&ip->cache[0]);
        hammer_uncache_node(&ip->cache[1]);
        hammer_inode_wakereclaims(ip, 1);
        if (ip->objid_cache)
                hammer_clear_objid(ip);
        --hammer_count_inodes;
        --hmp->count_inodes;
        if (ip->pfsm) {
                hammer_rel_pseudofs(hmp, ip->pfsm);
                ip->pfsm = NULL;
        }
        kfree(ip, hmp->m_inodes);
        ip = NULL;
}
/*
 * Retrieve pseudo-fs data.  NULL will never be returned.
 *
 * If an error occurs *errorp will be set and a default template is returned,
 * otherwise *errorp is set to 0.  Typically when an error occurs it will
 * be ENOENT.
 */
hammer_pseudofs_inmem_t
hammer_load_pseudofs(hammer_transaction_t trans,
                     u_int32_t localization, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        hammer_inode_t ip;
        hammer_pseudofs_inmem_t pfsm;
        struct hammer_cursor cursor;
        int bytes;

retry:
        pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
        if (pfsm) {
                hammer_ref(&pfsm->lock);
                *errorp = 0;
                return(pfsm);
        }

        /*
         * PFS records are stored in the root inode (not the PFS root inode,
         * but the real root).  Avoid an infinite recursion if loading
         * the PFS for the real root.
         */
        if (localization) {
                ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
                                      HAMMER_MAX_TID,
                                      HAMMER_DEF_LOCALIZATION, 0, errorp);
        } else {
                ip = NULL;
        }

        pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
        pfsm->localization = localization;
        pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
        pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;

        hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
        cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = localization;
        cursor.asof = HAMMER_MAX_TID;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        if (ip)
                *errorp = hammer_ip_lookup(&cursor);
        else
                *errorp = hammer_btree_lookup(&cursor);
        if (*errorp == 0) {
                *errorp = hammer_ip_resolve_data(&cursor);
                if (*errorp == 0) {
                        if (cursor.data->pfsd.mirror_flags &
                            HAMMER_PFSD_DELETED) {
                                *errorp = ENOENT;
                        } else {
                                bytes = cursor.leaf->data_len;
                                if (bytes > sizeof(pfsm->pfsd))
                                        bytes = sizeof(pfsm->pfsd);
                                bcopy(cursor.data, &pfsm->pfsd, bytes);
                        }
                }
        }
        hammer_done_cursor(&cursor);

        pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
        hammer_ref(&pfsm->lock);
        if (ip)
                hammer_rel_inode(ip, 0);
        if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
                kfree(pfsm, hmp->m_misc);
                goto retry;
        }
        return(pfsm);
}
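
/*
 * Illustrative sketch (an assumption, not part of the original file):
 * callers can rely on always getting a referenced template back, even
 * when the on-media record is missing:
 *
 *      hammer_pseudofs_inmem_t pfsm;
 *      int error;
 *
 *      pfsm = hammer_load_pseudofs(trans, localization, &error);
 *      (pfsm->pfsd is usable either way; error is typically ENOENT)
 *      hammer_rel_pseudofs(trans->hmp, pfsm);
 */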
/*
 * Store pseudo-fs data.  The backend will automatically delete any prior
 * on-disk pseudo-fs data but we have to delete in-memory versions.
 */
int
hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
{
        struct hammer_cursor cursor;
        hammer_record_t record;
        hammer_inode_t ip;
        int error;

        ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
                              HAMMER_DEF_LOCALIZATION, 0, &error);
retry:
        pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
        hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
        cursor.key_beg.localization = ip->obj_localization +
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = pfsm->localization;
        cursor.asof = HAMMER_MAX_TID;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor);
        if (error == 0 && hammer_cursor_inmem(&cursor)) {
                record = cursor.iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        KKASSERT(cursor.deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor.deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        error = 0;
                }
        }
        if (error == 0 || error == ENOENT) {
                record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
                record->type = HAMMER_MEM_RECORD_GENERAL;

                record->leaf.base.localization = ip->obj_localization +
                                                 HAMMER_LOCALIZE_MISC;
                record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
                record->leaf.base.key = pfsm->localization;
                record->leaf.data_len = sizeof(pfsm->pfsd);
                bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
                error = hammer_ip_add_record(trans, record);
        }
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        hammer_rel_inode(ip, 0);
        return(error);
}
/*
 * Create a root directory for a PFS if one does not already exist.
 *
 * The PFS root stands alone so we must also bump the nlinks count
 * to prevent it from being destroyed on release.
 */
int
hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
                       hammer_pseudofs_inmem_t pfsm)
{
        hammer_inode_t ip;
        struct vattr vap;
        int error;

        ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
                              pfsm->localization, 0, &error);
        if (ip == NULL) {
                vattr_null(&vap);
                vap.va_mode = 0755;
                vap.va_type = VDIR;
                error = hammer_create_inode(trans, &vap, cred, NULL, pfsm, &ip);
                if (error == 0) {
                        ++ip->ino_data.nlinks;
                        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (ip)
                hammer_rel_inode(ip, 0);
        return(error);
}
/*
 * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
 * if we are unable to disassociate all the inodes.
 */
static int
hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
{
        int res;

        hammer_ref(&ip->lock);
        if (ip->lock.refs == 2 && ip->vp)
                vclean_unlocked(ip->vp);
        if (ip->lock.refs == 1 && ip->vp == NULL)
                res = 0;
        else
                res = -1;       /* stop, someone is using the inode */
        hammer_rel_inode(ip, 0);
        return(res);
}

int
hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
        int res;
        int try;

        for (try = res = 0; try < 4; ++try) {
                res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
                                                 hammer_inode_pfs_cmp,
                                                 hammer_unload_pseudofs_callback,
                                                 &localization);
                if (res == 0 && try > 1)
                        break;
                hammer_flusher_sync(trans->hmp);
        }
        if (res != 0)
                res = ENOTEMPTY;
        return(res);
}
/*
 * Release a reference on a PFS
 */
void
hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
{
        hammer_unref(&pfsm->lock);
        if (pfsm->lock.refs == 0) {
                RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
                kfree(pfsm, hmp->m_misc);
        }
}
/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_t record;
        int error;
        int redirty;

retry:
        error = 0;

        /*
         * If the inode has a presence on-disk then locate it and mark
         * it deleted, setting DELONDISK.
         *
         * The record may or may not be physically deleted, depending on
         * the retention policy.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
            HAMMER_INODE_ONDISK) {
                hammer_normalize_cursor(cursor);
                cursor->key_beg.localization = ip->obj_localization +
                                               HAMMER_LOCALIZE_INODE;
                cursor->key_beg.obj_id = ip->obj_id;
                cursor->key_beg.key = 0;
                cursor->key_beg.create_tid = 0;
                cursor->key_beg.delete_tid = 0;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
                cursor->key_beg.obj_type = 0;
                cursor->asof = ip->obj_asof;
                cursor->flags &= ~HAMMER_CURSOR_INITMASK;
                cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
                cursor->flags |= HAMMER_CURSOR_BACKEND;

                error = hammer_btree_lookup(cursor);
                if (hammer_debug_inode)
                        kprintf("IPDEL %p %08x %d", ip, ip->flags, error);

                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
                        if (hammer_debug_inode)
                                kprintf(" error %d\n", error);
                        if (error == 0)
                                ip->flags |= HAMMER_INODE_DELONDISK;
                        if (cursor->node)
                                hammer_cache_node(&ip->cache[0], cursor->node);
                }
                if (error == EDEADLK) {
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("IPDED %p %d\n", ip, error);
                        if (error == 0)
                                goto retry;
                }
        }

        /*
         * Ok, write out the initial record or a new record (after deleting
         * the old one), unless the DELETED flag is set.  This routine will
         * clear DELONDISK if it writes out a record.
         *
         * Update our inode statistics if this is the first application of
         * the inode on-disk.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
                /*
                 * Generate a record and write it to the media.  We clean-up
                 * the state before releasing so we do not have to set-up
                 * a flush_group.
                 */
                record = hammer_alloc_mem_record(ip, 0);
                record->type = HAMMER_MEM_RECORD_INODE;
                record->flush_state = HAMMER_FST_FLUSH;
                record->leaf = ip->sync_ino_leaf;
                record->leaf.base.create_tid = trans->tid;
                record->leaf.data_len = sizeof(ip->sync_ino_data);
                record->leaf.create_ts = trans->time32;
                record->data = (void *)&ip->sync_ino_data;
                record->flags |= HAMMER_RECF_INTERLOCK_BE;

                /*
                 * If this flag is set we cannot sync the new file size
                 * because we haven't finished related truncations.  The
                 * inode will be flushed in another flush group to finish
                 * the job.
                 */
                if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
                    ip->sync_ino_data.size != ip->ino_data.size) {
                        redirty = 1;
                        ip->sync_ino_data.size = ip->ino_data.size;
                } else {
                        redirty = 0;
                }

                for (;;) {
                        error = hammer_ip_sync_record_cursor(cursor, record);
                        if (hammer_debug_inode)
                                kprintf("GENREC %p rec %08x %d\n",
                                        ip, record->flags, error);
                        if (error != EDEADLK)
                                break;
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                kprintf("GENREC reinit %d\n", error);
                        if (error)
                                break;
                }

                /*
                 * The record isn't managed by the inode's record tree,
                 * destroy it whether we succeed or fail.
                 */
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
                record->flags |= HAMMER_RECF_DELETED_FE | HAMMER_RECF_COMMITTED;
                record->flush_state = HAMMER_FST_IDLE;
                hammer_rel_mem_record(record);

                /*
                 * Finish up.
                 */
                if (error == 0) {
                        if (hammer_debug_inode)
                                kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
                        ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                            HAMMER_INODE_ATIME |
                                            HAMMER_INODE_MTIME);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
                        if (redirty)
                                ip->sync_flags |= HAMMER_INODE_DDIRTY;

                        /*
                         * Root volume count of inodes
                         */
                        hammer_sync_lock_sh(trans);
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume_field(trans,
                                                           trans->rootvol,
                                                           vol0_stat_inodes);
                                ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans->rootvol);
                                ip->flags |= HAMMER_INODE_ONDISK;
                                if (hammer_debug_inode)
                                        kprintf("NOWONDISK %p\n", ip);
                        }
                        hammer_sync_unlock(trans);
                }
        }

        /*
         * If the inode has been destroyed, clean out any left-over flags
         * that may have been set by the frontend.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                    HAMMER_INODE_ATIME |
                                    HAMMER_INODE_MTIME);
        }
        return(error);
}
/*
 * Update only the itimes fields.
 *
 * ATIME can be updated without generating any UNDO.  MTIME is updated
 * with UNDO so it is guaranteed to be synchronized properly in case of
 * a crash.
 *
 * Neither field is included in the B-Tree leaf element's CRC, which is how
 * we can get away with updating ATIME the way we do.
 */
static int
hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        int error;

retry:
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
            HAMMER_INODE_ONDISK) {
                return(0);
        }

        hammer_normalize_cursor(cursor);
        cursor->key_beg.localization = ip->obj_localization +
                                       HAMMER_LOCALIZE_INODE;
        cursor->key_beg.obj_id = ip->obj_id;
        cursor->key_beg.key = 0;
        cursor->key_beg.create_tid = 0;
        cursor->key_beg.delete_tid = 0;
        cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor->key_beg.obj_type = 0;
        cursor->asof = ip->obj_asof;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_ASOF;
        cursor->flags |= HAMMER_CURSOR_GET_LEAF;
        cursor->flags |= HAMMER_CURSOR_GET_DATA;
        cursor->flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(cursor);
        if (error == 0) {
                hammer_cache_node(&ip->cache[0], cursor->node);
                if (ip->sync_flags & HAMMER_INODE_MTIME) {
                        /*
                         * Updating MTIME requires an UNDO.  Just cover
                         * both atime and mtime.
                         */
                        hammer_sync_lock_sh(trans);
                        hammer_modify_buffer(trans, cursor->data_buffer,
                                     HAMMER_ITIMES_BASE(&cursor->data->inode),
                                     HAMMER_ITIMES_BYTES);
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        cursor->data->inode.mtime = ip->sync_ino_data.mtime;
                        hammer_modify_buffer_done(cursor->data_buffer);
                        hammer_sync_unlock(trans);
                } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
                        /*
                         * Updating atime only can be done in-place with
                         * no UNDO.
                         */
                        hammer_sync_lock_sh(trans);
                        hammer_modify_buffer(trans, cursor->data_buffer,
                                             NULL, 0);
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        hammer_modify_buffer_done(cursor->data_buffer);
                        hammer_sync_unlock(trans);
                }
                ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
        }
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor,
                                           &ip->cache[0], ip);
                if (error == 0)
                        goto retry;
        }
        return(error);
}
/*
 * Release a reference on an inode, flush as requested.
 *
 * On the last reference we queue the inode to the flusher for its final
 * disposition.
 */
void
hammer_rel_inode(struct hammer_inode *ip, int flush)
{
        /*hammer_mount_t hmp = ip->hmp;*/

        /*
         * Handle disposition when dropping the last ref.
         */
        for (;;) {
                if (ip->lock.refs == 1) {
                        /*
                         * Determine whether on-disk action is needed for
                         * the inode's final disposition.
                         */
                        KKASSERT(ip->vp == NULL);
                        hammer_inode_unloadable_check(ip, 0);
                        if (ip->flags & HAMMER_INODE_MODMASK) {
                                hammer_flush_inode(ip, 0);
                        } else if (ip->lock.refs == 1) {
                                hammer_unload_inode(ip);
                                break;
                        }
                } else {
                        if (flush)
                                hammer_flush_inode(ip, 0);

                        /*
                         * The inode still has multiple refs, try to drop
                         * one ref.
                         */
                        KKASSERT(ip->lock.refs >= 1);
                        if (ip->lock.refs > 1) {
                                hammer_unref(&ip->lock);
                                break;
                        }
                }
        }
}
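
/*
 * Illustrative sketch (an assumption, not part of the original file):
 * the typical reference pairing seen throughout this file:
 *
 *      hammer_ref(&ip->lock);
 *      (operate on ip)
 *      hammer_rel_inode(ip, 0);
 *
 * Per the loop above, passing flush != 0 additionally queues the inode
 * to the flusher while other references remain.
 */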
/*
 * Unload and destroy the specified inode.  Must be called with one remaining
 * reference.  The reference is disposed of.
 *
 * The inode must be completely clean.
 */
static int
hammer_unload_inode(struct hammer_inode *ip)
{
        hammer_mount_t hmp = ip->hmp;

        KASSERT(ip->lock.refs == 1,
                ("hammer_unload_inode: %d refs\n", ip->lock.refs));
        KKASSERT(ip->vp == NULL);
        KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
        KKASSERT(ip->cursor_ip_refs == 0);
        KKASSERT(ip->lock.lockcount == 0);
        KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);

        KKASSERT(RB_EMPTY(&ip->rec_tree));
        KKASSERT(TAILQ_EMPTY(&ip->target_list));

        RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);

        hammer_free_inode(ip);
        return(0);
}
/*
 * Called during unmounting if a critical error occurred.  The in-memory
 * inode and all related structures are destroyed.
 *
 * If a critical error did not occur the unmount code calls the standard
 * release and asserts that the inode is gone.
 */
int
hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
{
        hammer_record_t rec;

        /*
         * Get rid of the inodes in-memory records, regardless of their
         * state, and clear the mod-mask.
         */
        while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
                TAILQ_REMOVE(&ip->target_list, rec, target_entry);
                rec->target_ip = NULL;
                if (rec->flush_state == HAMMER_FST_SETUP)
                        rec->flush_state = HAMMER_FST_IDLE;
        }
        while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
                if (rec->flush_state == HAMMER_FST_FLUSH)
                        --rec->flush_group->refs;
                else
                        hammer_ref(&rec->lock);
                KKASSERT(rec->lock.refs == 1);
                rec->flush_state = HAMMER_FST_IDLE;
                rec->flush_group = NULL;
                rec->flags |= HAMMER_RECF_DELETED_FE;
                rec->flags |= HAMMER_RECF_DELETED_BE;
                hammer_rel_mem_record(rec);
        }
        ip->flags &= ~HAMMER_INODE_MODMASK;
        ip->sync_flags &= ~HAMMER_INODE_MODMASK;
        KKASSERT(ip->vp == NULL);

        /*
         * Remove the inode from any flush group, force it idle.  FLUSH
         * and SETUP states have an inode ref.
         */
        switch(ip->flush_state) {
        case HAMMER_FST_FLUSH:
                TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
                --ip->flush_group->refs;
                ip->flush_group = NULL;
                /* fall through */
        case HAMMER_FST_SETUP:
                hammer_unref(&ip->lock);
                ip->flush_state = HAMMER_FST_IDLE;
                /* fall through */
        case HAMMER_FST_IDLE:
                break;
        }

        /*
         * There shouldn't be any associated vnode.  The unload needs at
         * least one ref, if we do have a vp steal its ip ref.
         */
        if (ip->vp) {
                kprintf("hammer_destroy_inode_callback: Unexpected "
                        "vnode association ip %p vp %p\n", ip, ip->vp);
                ip->vp->v_data = NULL;
                ip->vp = NULL;
        } else {
                hammer_ref(&ip->lock);
        }
        hammer_unload_inode(ip);
        return(0);
}
/*
 * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
 * the read-only flag for cached inodes.
 *
 * This routine is called from a RB_SCAN().
 */
int
hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
{
        hammer_mount_t hmp = ip->hmp;

        if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
                ip->flags |= HAMMER_INODE_RO;
        else
                ip->flags &= ~HAMMER_INODE_RO;
        return(0);
}
/*
 * A transaction has modified an inode, requiring updates as specified by
 * the passed flags.
 *
 * HAMMER_INODE_DDIRTY: Inode data has been updated
 * HAMMER_INODE_XDIRTY: Dirty in-memory records
 * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
 * HAMMER_INODE_DELETED: Inode record/data must be deleted
 * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
 */
void
hammer_modify_inode(hammer_inode_t ip, int flags)
{
        /*
         * ronly of 0 or 2 does not trigger assertion.
         * 2 is a special error state
         */
        KKASSERT(ip->hmp->ronly != 1 ||
                  (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
                            HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
                            HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
        if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
                ip->flags |= HAMMER_INODE_RSV_INODES;
                ++ip->hmp->rsv_inodes;
        }

        ip->flags |= flags;
}
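
/*
 * Illustrative sketch (grounded in calls elsewhere in this file): after
 * changing in-memory inode data the caller flags the dirty state, e.g.
 * as hammer_mkroot_pseudofs() does when bumping the link count:
 *
 *      ++ip->ino_data.nlinks;
 *      hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
 */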
/*
 * Request that an inode be flushed.  This whole mess cannot block and may
 * recurse (if not synchronous).  Once requested HAMMER will attempt to
 * actively flush the inode until the flush can be done.
 *
 * The inode may already be flushing, or may be in a setup state.  We can
 * place the inode in a flushing state if it is currently idle and flag it
 * to reflush if it is currently flushing.
 *
 * Upon return if the inode could not be flushed due to a setup
 * dependency, then it will be automatically flushed when the dependency
 * is satisfied.
 */
void
hammer_flush_inode(hammer_inode_t ip, int flags)
{
        hammer_mount_t hmp;
        hammer_flush_group_t flg;
        int good;

        /*
         * next_flush_group is the first flush group we can place the inode
         * in.  It may be NULL.  If it becomes full we append a new flush
         * group and make that the next_flush_group.
         */
        hmp = ip->hmp;
        while ((flg = hmp->next_flush_group) != NULL) {
                KKASSERT(flg->running == 0);
                if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
                        break;
                hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
                hammer_flusher_async(ip->hmp, flg);
        }
        if (flg == NULL) {
                flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
                hmp->next_flush_group = flg;
                TAILQ_INIT(&flg->flush_list);
                TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
        }

        /*
         * Trivial 'nothing to flush' case.  If the inode is in a SETUP
         * state we have to put it back into an IDLE state so we can
         * drop the extra ref.
         *
         * If we have a parent dependency we must still fall through
         * so we can run it.
         */
        if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
                if (ip->flush_state == HAMMER_FST_SETUP &&
                    TAILQ_EMPTY(&ip->target_list)) {
                        ip->flush_state = HAMMER_FST_IDLE;
                        hammer_rel_inode(ip, 0);
                }
                if (ip->flush_state == HAMMER_FST_IDLE)
                        return;
        }

        /*
         * Our flush action will depend on the current state.
         */
        switch(ip->flush_state) {
        case HAMMER_FST_IDLE:
                /*
                 * We have no dependencies and can flush immediately.  Some
                 * of our children may not be flushable so we have to re-test
                 * with that additional knowledge.
                 */
                hammer_flush_inode_core(ip, flg, flags);
                break;
        case HAMMER_FST_SETUP:
                /*
                 * Recurse upwards through dependencies via target_list
                 * and start their flusher actions going if possible.
                 *
                 * 'good' is our connectivity.  -1 means we have none and
                 * can't flush, 0 means there weren't any dependencies, and
                 * 1 means we have good connectivity.
                 */
                good = hammer_setup_parent_inodes(ip, flg);

                if (good >= 0) {
                        /*
                         * We can continue if good >= 0.  Determine how
                         * many records under our inode can be flushed (and
                         * mark them).
                         */
                        hammer_flush_inode_core(ip, flg, flags);
                } else {
                        /*
                         * Parent has no connectivity, tell it to flush
                         * us as soon as it does.
                         *
                         * The REFLUSH flag is also needed to trigger
                         * dependency wakeups.
                         */
                        ip->flags |= HAMMER_INODE_CONN_DOWN |
                                     HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp, flg);
                        }
                }
                break;
        case HAMMER_FST_FLUSH:
                /*
                 * We are already flushing, flag the inode to reflush
                 * if needed after it completes its current flush.
                 *
                 * The REFLUSH flag is also needed to trigger
                 * dependency wakeups.
                 */
                if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
                        ip->flags |= HAMMER_INODE_REFLUSH;
                if (flags & HAMMER_FLUSH_SIGNAL) {
                        ip->flags |= HAMMER_INODE_RESIGNAL;
                        hammer_flusher_async(ip->hmp, flg);
                }
                break;
        }
}
/*
 * Scan ip->target_list, which is a list of records owned by PARENTS to our
 * ip which reference our ip.
 *
 * XXX This is a huge mess of recursive code, but not one bit of it blocks
 *     so for now do not ref/deref the structures.  Note that if we use the
 *     ref/rel code later, the rel CAN block.
 */
static int
hammer_setup_parent_inodes(hammer_inode_t ip, hammer_flush_group_t flg)
{
        hammer_record_t depend;
        int good;
        int r;

        good = 0;
        TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
                r = hammer_setup_parent_inodes_helper(depend, flg);
                KKASSERT(depend->target_ip == ip);
                if (r < 0 && good == 0)
                        good = -1;
                if (r > 0)
                        good = 1;
        }
        return(good);
}
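
/*
 * Descriptive note (not in the original file): the aggregation above means
 * a single parent providing connectivity (r > 0) outweighs any number of
 * unresolved parents (r < 0); good ends up -1 only when at least one
 * dependency failed and none provided connectivity.
 */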
/*
 * This helper function takes a record representing the dependency between
 * the parent inode and child inode.
 *
 * record->ip           = parent inode
 * record->target_ip    = child inode
 *
 * We are asked to recurse upwards and convert the record from SETUP
 * to FLUSH if possible.
 *
 * Return 1 if the record gives us connectivity
 *
 * Return 0 if the record is not relevant
 *
 * Return -1 if we can't resolve the dependency and there is no connectivity.
 */
static int
hammer_setup_parent_inodes_helper(hammer_record_t record,
                                  hammer_flush_group_t flg)
{
        hammer_inode_t pip;
        int good;

        KKASSERT(record->flush_state != HAMMER_FST_IDLE);
        pip = record->ip;

        /*
         * If the record is already flushing, is it in our flush group?
         *
         * If it is in our flush group but it is a general record or a
         * delete-on-disk, it does not improve our connectivity (return 0),
         * and if the target inode is not trying to destroy itself we can't
         * allow the operation yet anyway (the second return -1).
         */
        if (record->flush_state == HAMMER_FST_FLUSH) {
                /*
                 * If not in our flush group ask the parent to reflush
                 * us as soon as possible.
                 */
                if (record->flush_group != flg) {
                        pip->flags |= HAMMER_INODE_REFLUSH;
                        record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                        return(-1);
                }

                /*
                 * If in our flush group everything is already set up,
                 * just return whether the record will improve our
                 * visibility or not.
                 */
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
                return(0);
        }

        /*
         * It must be a setup record.  Try to resolve the setup dependencies
         * by recursing upwards so we can place ip on the flush list.
         */
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

        good = hammer_setup_parent_inodes(pip, flg);

        /*
         * If good < 0 the parent has no connectivity and we cannot safely
         * flush the directory entry, which also means we can't flush our
         * ip.  Flag the parent and us for downward recursion once the
         * parent's connectivity is resolved.
         */
        if (good < 0) {
                /* pip->flags |= HAMMER_INODE_CONN_DOWN; set by recursion */
                record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                return(good);
        }

        /*
         * We are go, place the parent inode in a flushing state so we can
         * place its record in a flushing state.  Note that the parent
         * may already be flushing.  The record must be in the same flush
         * group as the parent.
         */
        if (pip->flush_state != HAMMER_FST_FLUSH)
                hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
        KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flush_state == HAMMER_FST_SETUP);

        if (record->type == HAMMER_MEM_RECORD_DEL &&
            (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
                /*
                 * Regardless of flushing state we cannot sync this path if the
                 * record represents a delete-on-disk but the target inode
                 * is not ready to sync its own deletion.
                 *
                 * XXX need to count effective nlinks to determine whether
                 * the flush is ok, otherwise removing a hardlink will
                 * just leave the DEL record to rot.
                 */
                record->target_ip->flags |= HAMMER_INODE_REFLUSH;
                return(-1);
        }

        if (pip->flush_group == flg) {
                /*
                 * Because we have not calculated nlinks yet we can just
                 * set records to the flush state if the parent is in
                 * the same flush group as we are.
                 */
                record->flush_state = HAMMER_FST_FLUSH;
                record->flush_group = flg;
                ++record->flush_group->refs;
                hammer_ref(&record->lock);

                /*
                 * A general directory-add contributes to our visibility.
                 *
                 * Otherwise it is probably a directory-delete or
                 * delete-on-disk record and does not contribute to our
                 * visibility (but we can still flush it).
                 */
                if (record->type == HAMMER_MEM_RECORD_ADD)
                        return(1);
                return(0);
        } else {
                /*
                 * If the parent is not in our flush group we cannot
                 * flush this record yet, there is no visibility.
                 * We tell the parent to reflush and mark ourselves
                 * so the parent knows it should flush us too.
                 */
                pip->flags |= HAMMER_INODE_REFLUSH;
                record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
                return(-1);
        }
}
/*
 * This is the core routine placing an inode into the FST_FLUSH state.
 */
static void
hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
{
        int go_count;

        /*
         * Set flush state and prevent the flusher from cycling into
         * the next flush group.  Do not place the ip on the list yet.
         * Inodes not in the idle state get an extra reference.
         */
        KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
        if (ip->flush_state == HAMMER_FST_IDLE)
                hammer_ref(&ip->lock);
        ip->flush_state = HAMMER_FST_FLUSH;
        ip->flush_group = flg;
        ++ip->hmp->flusher.group_lock;
        ++ip->hmp->count_iqueued;
        ++hammer_count_iqueued;
        ++flg->total_count;

        /*
         * If the flush group reaches the autoflush limit we want to signal
         * the flusher.  This is particularly important for remove()s.
         */
        if (flg->total_count == hammer_autoflush)
                flags |= HAMMER_FLUSH_SIGNAL;

        /*
         * We need to be able to vfsync/truncate from the backend.
         */
        KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
        if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
                ip->flags |= HAMMER_INODE_VHELD;
                vref(ip->vp);
        }

        /*
         * Figure out how many in-memory records we can actually flush
         * (not including inode meta-data, buffers, etc).
         */
        KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
        if (flags & HAMMER_FLUSH_RECURSION) {
                /*
                 * If this is an upwards recursion we do not want to
                 * recurse down again!
                 */
                go_count = 1;
        } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
                /*
                 * No new records are added if we must complete a flush
                 * from a previous cycle, but we do have to move the records
                 * from the previous cycle to the current one.
                 */
                go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                   hammer_syncgrp_child_callback, NULL);
        } else {
                /*
                 * Normal flush, scan records and bring them into the flush.
                 * Directory adds and deletes are usually skipped (they are
                 * grouped with the related inode rather than with the
                 * directory).
                 *
                 * go_count can be negative, which means the scan aborted
                 * due to the flush group being over-full and we should
                 * flush what we have.
                 */
                go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
                                   hammer_setup_child_callback, NULL);
        }

        /*
         * This is a more involved test that includes go_count.  If we
         * can't flush, flag the inode and return.  If go_count is 0 we
         * were unable to flush any records in our rec_tree and
         * must ignore the XDIRTY flag.
         */
        if (go_count == 0) {
                if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
                        --ip->hmp->count_iqueued;
                        --hammer_count_iqueued;

                        --flg->total_count;
                        ip->flush_state = HAMMER_FST_SETUP;
                        ip->flush_group = NULL;
                        if (ip->flags & HAMMER_INODE_VHELD) {
                                ip->flags &= ~HAMMER_INODE_VHELD;
                                vrele(ip->vp);
                        }

                        /*
                         * REFLUSH is needed to trigger dependency wakeups
                         * when an inode is in SETUP.
                         */
                        ip->flags |= HAMMER_INODE_REFLUSH;
                        if (flags & HAMMER_FLUSH_SIGNAL) {
                                ip->flags |= HAMMER_INODE_RESIGNAL;
                                hammer_flusher_async(ip->hmp, flg);
                        }
                        if (--ip->hmp->flusher.group_lock == 0)
                                wakeup(&ip->hmp->flusher.group_lock);
                        return;
                }
        }

        /*
         * Snapshot the state of the inode for the backend flusher.
         *
         * We continue to retain save_trunc_off even when all truncations
         * have been resolved as an optimization to determine if we can
         * skip the B-Tree lookup for overwrite deletions.
         *
         * NOTE: The DELETING flag is a mod flag, but it is also sticky,
         * and stays in ip->flags.  Once set, it stays set until the
         * inode is destroyed.
         */
        if (ip->flags & HAMMER_INODE_TRUNCATED) {
                KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
                ip->sync_trunc_off = ip->trunc_off;
                ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
                ip->flags &= ~HAMMER_INODE_TRUNCATED;
                ip->sync_flags |= HAMMER_INODE_TRUNCATED;

                /*
                 * The save_trunc_off used to cache whether the B-Tree
                 * holds any records past that point is not used until
                 * after the truncation has succeeded, so we can safely
                 * set it now.
                 */
                if (ip->save_trunc_off > ip->sync_trunc_off)
                        ip->save_trunc_off = ip->sync_trunc_off;
        }
        ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
                           ~HAMMER_INODE_TRUNCATED);
        ip->sync_ino_leaf = ip->ino_leaf;
        ip->sync_ino_data = ip->ino_data;
        ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
#ifdef DEBUG_TRUNCATE
        if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
                kprintf("truncateS %016llx\n", ip->sync_trunc_off);
#endif

        /*
         * The flusher list inherits our inode and reference.
         */
        KKASSERT(flg->running == 0);
        TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
        if (--ip->hmp->flusher.group_lock == 0)
                wakeup(&ip->hmp->flusher.group_lock);

        if (flags & HAMMER_FLUSH_SIGNAL) {
                hammer_flusher_async(ip->hmp, flg);
        }
}
/*
 * Callback for scan of ip->rec_tree.  Try to include each record in our
 * flush.  ip->flush_group has been set but the inode has not yet been
 * moved into a flushing state.
 *
 * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
 * both inodes.
 *
 * We return 1 for any record placed or found in FST_FLUSH, which prevents
 * the caller from shortcutting the flush.
 */
static int
hammer_setup_child_callback(hammer_record_t rec, void *data)
{
        hammer_flush_group_t flg;
        hammer_inode_t target_ip;
        hammer_inode_t ip;
        int r;

        /*
         * Deleted records are ignored.  Note that the flush detects deleted
         * front-end records at multiple points to deal with races.  This is
         * just the first line of defense.  The only time DELETED_FE cannot
         * be set is when HAMMER_RECF_INTERLOCK_BE is set.
         *
         * Don't get confused between record deletion and, say, directory
         * entry deletion.  The deletion of a directory entry that is on
         * the media has nothing to do with the record deletion flags.
         */
        if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) {
                if (rec->flush_state == HAMMER_FST_FLUSH) {
                        KKASSERT(rec->flush_group == rec->ip->flush_group);
                        r = 1;
                } else {
                        r = 0;
                }
                return(r);
        }

        /*
         * If the record is in an idle state it has no dependencies and
         * can be flushed.
         */
        ip = rec->ip;
        flg = ip->flush_group;
        r = 0;

        switch(rec->flush_state) {
        case HAMMER_FST_IDLE:
                /*
                 * The record has no setup dependency, we can flush it.
                 */
                KKASSERT(rec->target_ip == NULL);
                rec->flush_state = HAMMER_FST_FLUSH;
                rec->flush_group = flg;
                ++flg->refs;
                hammer_ref(&rec->lock);
                r = 1;
                break;
        case HAMMER_FST_SETUP:
                /*
                 * The record has a setup dependency.  These are typically
                 * directory entry adds and deletes.  Such entries will be
                 * flushed when their inodes are flushed so we do not
                 * usually have to add them to the flush here.  However,
                 * if the target_ip has set HAMMER_INODE_CONN_DOWN then
                 * it is asking us to flush this record (and it).
                 */
                target_ip = rec->target_ip;
                KKASSERT(target_ip != NULL);
                KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);

                /*
                 * If the target IP is already flushing in our group
                 * we could associate the record, but target_ip has
                 * already synced ino_data to sync_ino_data and we
                 * would also have to adjust nlinks.  Plus there are
                 * ordering issues for adds and deletes.
                 *
                 * Reflush downward if this is an ADD, and upward if
                 * this is a DEL.
                 */
                if (target_ip->flush_state == HAMMER_FST_FLUSH) {
                        if (rec->type == HAMMER_MEM_RECORD_ADD)
                                ip->flags |= HAMMER_INODE_REFLUSH;
                        else
                                target_ip->flags |= HAMMER_INODE_REFLUSH;
                        break;
                }

                /*
                 * Target IP is not yet flushing.  This can get complex
                 * because we have to be careful about the recursion.
                 *
                 * Directories create an issue for us in that if a flush
                 * of a directory is requested the expectation is to flush
                 * any pending directory entries, but this will cause the
                 * related inodes to recursively flush as well.  We can't
                 * really defer the operation so just get as many as we
                 * can.
                 */
                if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
                    (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
                        /*
                         * We aren't reclaiming and the target ip was not
                         * previously prevented from flushing due to this
                         * record dependency.  Do not flush this record.
                         */
                        /* r = 0; (leave the record alone) */
                } else
                if (flg->total_count + flg->refs >
                    ip->hmp->undo_rec_limit) {
                        /*
                         * Our flush group is over-full and we risk blowing
                         * out the UNDO FIFO.  Stop the scan, flush what we
                         * have, then reflush the directory.
                         *
                         * The directory may be forced through multiple
                         * flush groups before it can be completely
                         * flushed.
                         */
                        ip->flags |= HAMMER_INODE_RESIGNAL |
                                     HAMMER_INODE_REFLUSH;
                        r = -1;
                } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
                        /*
                         * If the target IP is not flushing we can force
                         * it to flush, even if it is unable to write out
                         * any of its own records we have at least one in
                         * hand that we CAN deal with.
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
                        rec->flush_group = flg;
                        ++flg->refs;
                        hammer_ref(&rec->lock);
                        hammer_flush_inode_core(target_ip, flg,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                } else {
                        /*
                         * General or delete-on-disk record.
                         *
                         * XXX this needs help.  If a delete-on-disk we could
                         * disconnect the target.  If the target has its own
                         * dependencies they really need to be flushed.
                         *
                         * XXX
                         */
                        rec->flush_state = HAMMER_FST_FLUSH;
                        rec->flush_group = flg;
                        ++flg->refs;
                        hammer_ref(&rec->lock);
                        hammer_flush_inode_core(target_ip, flg,
                                                HAMMER_FLUSH_RECURSION);
                        r = 1;
                }
                break;
        case HAMMER_FST_FLUSH:
                /*
                 * The flush_group should already match.
                 */
                KKASSERT(rec->flush_group == flg);
                r = 1;
                break;
        }
        return(r);
}
/*
 * This version just moves records already in a flush state to the new
 * flush group and that is it.
 */
static int
hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
{
        hammer_inode_t ip = rec->ip;

        switch(rec->flush_state) {
        case HAMMER_FST_FLUSH:
                KKASSERT(rec->flush_group == ip->flush_group);
                break;
        default:
                break;
        }
        return(0);
}
/*
 * Wait for a previously queued flush to complete.
 *
 * If a critical error occurred we don't try to wait.
 */
void
hammer_wait_inode(hammer_inode_t ip)
{
        hammer_flush_group_t flg;

        flg = NULL;
        if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
                while (ip->flush_state != HAMMER_FST_IDLE &&
                       (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
                        if (ip->flush_state == HAMMER_FST_SETUP)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        if (ip->flush_state != HAMMER_FST_IDLE) {
                                ip->flags |= HAMMER_INODE_FLUSHW;
                                tsleep(&ip->flags, 0, "hmrwin", 0);
                        }
                }
        }
}
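
/*
 * Illustrative sketch (an assumption, not part of the original file):
 * fsync-style callers pair a signaled flush with the wait:
 *
 *      hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 *      hammer_wait_inode(ip);
 */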
2035 * Called by the backend code when a flush has been completed.
2036 * The inode has already been removed from the flush list.
2038 * A pipelined flush can occur, in which case we must re-enter the
2039 * inode on the list and re-copy its fields.
2042 hammer_flush_inode_done(hammer_inode_t ip
, int error
)
2047 KKASSERT(ip
->flush_state
== HAMMER_FST_FLUSH
);
2052 * Auto-reflush if the backend could not completely flush
2053 * the inode. This fixes a case where a deferred buffer flush
2054 * could cause fsync to return early.
2056 if (ip
->sync_flags
& HAMMER_INODE_MODMASK
)
2057 ip
->flags
|= HAMMER_INODE_REFLUSH
;
2060 * Merge left-over flags back into the frontend and fix the state.
2061 * Incomplete truncations are retained by the backend.
2064 ip
->flags
|= ip
->sync_flags
& ~HAMMER_INODE_TRUNCATED
;
2065 ip
->sync_flags
&= HAMMER_INODE_TRUNCATED
;
2068 * The backend may have adjusted nlinks, so if the adjusted nlinks
2069 * does not match the fronttend set the frontend's RDIRTY flag again.
2071 if (ip
->ino_data
.nlinks
!= ip
->sync_ino_data
.nlinks
)
2072 ip
->flags
|= HAMMER_INODE_DDIRTY
;
2075 * Fix up the dirty buffer status.
2077 if (ip
->vp
&& RB_ROOT(&ip
->vp
->v_rbdirty_tree
)) {
2078 ip
->flags
|= HAMMER_INODE_BUFS
;
2082 * Re-set the XDIRTY flag if some of the inode's in-memory records
2083 * could not be flushed.
2085 KKASSERT((RB_EMPTY(&ip
->rec_tree
) &&
2086 (ip
->flags
& HAMMER_INODE_XDIRTY
) == 0) ||
2087 (!RB_EMPTY(&ip
->rec_tree
) &&
2088 (ip
->flags
& HAMMER_INODE_XDIRTY
) != 0));
2091 * Do not lose track of inodes which no longer have vnode
2092 * assocations, otherwise they may never get flushed again.
2094 * The reflush flag can be set superfluously, causing extra pain
2095 * for no reason. If the inode is no longer modified it no longer
2096 * needs to be flushed.
2098 if (ip
->flags
& HAMMER_INODE_MODMASK
) {
2100 ip
->flags
|= HAMMER_INODE_REFLUSH
;
2102 ip
->flags
&= ~HAMMER_INODE_REFLUSH
;
2106 * Adjust the flush state.
2108 if (ip
->flags
& HAMMER_INODE_WOULDBLOCK
) {
2110 * We were unable to flush out all our records, leave the
2111 * inode in a flush state and in the current flush group.
2112 * The flush group will be re-run.
2114 * This occurs if the UNDO block gets too full or there is
2115 * too much dirty meta-data and allows the flusher to
2116 * finalize the UNDO block and then re-flush.
2118 ip
->flags
&= ~HAMMER_INODE_WOULDBLOCK
;
2122 * Remove from the flush_group
2124 TAILQ_REMOVE(&ip
->flush_group
->flush_list
, ip
, flush_entry
);
2125 ip
->flush_group
= NULL
;
2128 * Clean up the vnode ref and tracking counts.
2130 if (ip
->flags
& HAMMER_INODE_VHELD
) {
2131 ip
->flags
&= ~HAMMER_INODE_VHELD
;
2134 --hmp
->count_iqueued
;
2135 --hammer_count_iqueued
;
2138 * And adjust the state.
2140 if (TAILQ_EMPTY(&ip
->target_list
) && RB_EMPTY(&ip
->rec_tree
)) {
2141 ip
->flush_state
= HAMMER_FST_IDLE
;
2144 ip
->flush_state
= HAMMER_FST_SETUP
;
		/*
		 * If the frontend is waiting for a flush to complete,
		 * wake it up.
		 */
		if (ip->flags & HAMMER_INODE_FLUSHW) {
			ip->flags &= ~HAMMER_INODE_FLUSHW;
			wakeup(&ip->flags);
		}
		/*
		 * If the frontend made more changes and requested another
		 * flush, then try to get it running.
		 *
		 * Reflushes are aborted when the inode is errored out.
		 */
		if (ip->flags & HAMMER_INODE_REFLUSH) {
			ip->flags &= ~HAMMER_INODE_REFLUSH;
			if (ip->flags & HAMMER_INODE_RESIGNAL) {
				ip->flags &= ~HAMMER_INODE_RESIGNAL;
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			} else {
				hammer_flush_inode(ip, 0);
			}
		}
	}
	/*
	 * If we have no parent dependencies we can clear CONN_DOWN.
	 */
	if (TAILQ_EMPTY(&ip->target_list))
		ip->flags &= ~HAMMER_INODE_CONN_DOWN;
	/*
	 * If the inode is now clean drop the space reservation.
	 */
	if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
	    (ip->flags & HAMMER_INODE_RSV_INODES)) {
		ip->flags &= ~HAMMER_INODE_RSV_INODES;
		--hmp->rsv_inodes;
	}

	if (dorel)
		hammer_rel_inode(ip, 0);
}
/*
 * Called from hammer_sync_inode() to synchronize in-memory records
 * to the media.
 */
static int
hammer_sync_record_callback(hammer_record_t record, void *data)
{
	hammer_cursor_t cursor = data;
	hammer_transaction_t trans = cursor->trans;
	hammer_mount_t hmp = trans->hmp;
	int error;
	/*
	 * Skip records that do not belong to the current flush.
	 */
	++hammer_stats_record_iterations;
	if (record->flush_state != HAMMER_FST_FLUSH)
		return(0);
	if (record->flush_group != record->ip->flush_group) {
		kprintf("sync_record %p ip %p bad flush group %p %p\n",
			record, record->ip, record->flush_group,
			record->ip->flush_group);
		return(0);
	}
	KKASSERT(record->flush_group == record->ip->flush_group);
	/*
	 * Interlock the record using the BE flag.  Once BE is set the
	 * frontend cannot change the state of FE.
	 *
	 * NOTE: If FE is set prior to us setting BE we still sync the
	 * record out, but the flush completion code converts it to
	 * a delete-on-disk record instead of destroying it.
	 */
	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	record->flags |= HAMMER_RECF_INTERLOCK_BE;
	/*
	 * The backend may have already disposed of the record.
	 */
	if (record->flags & HAMMER_RECF_DELETED_BE) {
		error = 0;
		goto done;
	}
	/*
	 * If the whole inode is being deleted all on-disk records will
	 * be deleted very soon, we can't sync any new records to disk
	 * because they will be deleted in the same transaction they were
	 * created in (delete_tid == create_tid), which will assert.
	 *
	 * XXX There may be a case with RECORD_ADD with DELETED_FE set
	 * that we currently panic on.
	 */
	if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
		switch(record->type) {
		case HAMMER_MEM_RECORD_DATA:
			/*
			 * We don't have to do anything, if the record was
			 * committed the space will have been accounted for
			 * anyway.
			 */
			/* fall through */
		case HAMMER_MEM_RECORD_GENERAL:
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
			error = 0;
			goto done;
		case HAMMER_MEM_RECORD_ADD:
			panic("hammer_sync_record_callback: illegal add "
			      "during inode deletion record %p", record);
			break; /* NOT REACHED */
		case HAMMER_MEM_RECORD_INODE:
			panic("hammer_sync_record_callback: attempt to "
			      "sync inode record %p?", record);
			break; /* NOT REACHED */
		case HAMMER_MEM_RECORD_DEL:
			/*
			 * Follow through and issue the on-disk deletion.
			 */
			break;
		}
	}
	/*
	 * If DELETED_FE is set special handling is needed for directory
	 * entries.  Dependent pieces related to the directory entry may
	 * have already been synced to disk.  If this occurs we have to
	 * sync the directory entry and then change the in-memory record
	 * from an ADD to a DELETE to cover the fact that it's been
	 * deleted by the frontend.
	 *
	 * A directory delete covering record (MEM_RECORD_DEL) can never
	 * be deleted by the frontend.
	 *
	 * Any other record type (aka DATA) can be deleted by the frontend.
	 * XXX At the moment the flusher must skip it because there may
	 * be another data record in the flush group for the same block,
	 * meaning that some frontend data changes can leak into the backend's
	 * synchronization point.
	 */
	if (record->flags & HAMMER_RECF_DELETED_FE) {
		if (record->type == HAMMER_MEM_RECORD_ADD) {
			record->flags |= HAMMER_RECF_CONVERT_DELETE;
		} else {
			KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
			record->flags |= HAMMER_RECF_DELETED_BE;
			error = 0;
			goto done;
		}
	}
	/*
	 * Assign the create_tid for new records.  Deletions already
	 * have the record's entire key properly set up.
	 */
	if (record->type != HAMMER_MEM_RECORD_DEL) {
		record->leaf.base.create_tid = trans->tid;
		record->leaf.create_ts = trans->time32;
	}
	for (;;) {
		error = hammer_ip_sync_record_cursor(cursor, record);
		if (error != EDEADLK)
			break;
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor,
					   &record->ip->cache[0],
					   record->ip);
		if (error)
			break;
	}
	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
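	/*
	 * NOTE: By convention a negative return value from an RB_SCAN
	 * callback aborts the scan, so a hard error is presumably
	 * negated here and flipped back to a positive errno by the
	 * caller.
	 */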
	if (error)
		error = -error;
done:
	hammer_flush_record_done(record, error);
	/*
	 * Do partial finalization if we have built up too many dirty
	 * buffers.  Otherwise a buffer cache deadlock can occur when
	 * doing things like creating tens of thousands of tiny files.
	 *
	 * We must release our cursor lock to avoid a 3-way deadlock
	 * due to the exclusive sync lock the finalizer must get.
	 */
	if (hammer_flusher_meta_limit(hmp)) {
		hammer_unlock_cursor(cursor, 0);
		hammer_flusher_finalize(trans, 0);
		hammer_lock_cursor(cursor, 0);
	}

	return(error);
}
/*
 * Backend function called by the flusher to sync an inode to media.
 */
int
hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_node_t tmp_node;
	hammer_record_t depend;
	hammer_record_t next;
	int error, tmp_error;
	u_int64_t nlinks;
	if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
		return(0);

	error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
	if (error)
		return(error);
	/*
	 * Any directory records referencing this inode which are not in
	 * our current flush group must adjust our nlink count for the
	 * purposes of synchronization to disk.
	 *
	 * Records which are in our flush group can be unlinked from our
	 * inode now, potentially allowing the inode to be physically
	 * deleted.
	 *
	 * This cannot block.
	 */
	nlinks = ip->ino_data.nlinks;
	next = TAILQ_FIRST(&ip->target_list);
	while ((depend = next) != NULL) {
		next = TAILQ_NEXT(depend, target_entry);
		if (depend->flush_state == HAMMER_FST_FLUSH &&
		    depend->flush_group == ip->flush_group) {
			/*
			 * If this is an ADD that was deleted by the frontend
			 * the frontend nlinks count will have already been
			 * decremented, but the backend is going to sync its
			 * directory entry and must account for it.  The
			 * record will be converted to a delete-on-disk when
			 * it gets synced.
			 *
			 * If the ADD was not deleted by the frontend we
			 * can remove the dependency from our target_list.
			 */
			if (depend->flags & HAMMER_RECF_DELETED_FE) {
				++nlinks;
			} else {
				TAILQ_REMOVE(&ip->target_list, depend,
					     target_entry);
				depend->target_ip = NULL;
			}
		} else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
			/*
			 * Not part of our flush group.
			 */
			KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
			switch(depend->type) {
			case HAMMER_MEM_RECORD_ADD:
				--nlinks;
				break;
			case HAMMER_MEM_RECORD_DEL:
				++nlinks;
				break;
			default:
				break;
			}
		}
	}
	/*
	 * Set dirty if we had to modify the link count.
	 */
	if (ip->sync_ino_data.nlinks != nlinks) {
		KKASSERT((int64_t)nlinks >= 0);
		ip->sync_ino_data.nlinks = nlinks;
		ip->sync_flags |= HAMMER_INODE_DDIRTY;
	}
	/*
	 * If there is a truncation queued destroy any data past the (aligned)
	 * truncation point.  Userland will have dealt with the buffer
	 * containing the truncation point for us.
	 *
	 * We don't flush pending frontend data buffers until after we've
	 * dealt with the truncation.
	 */
	if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
		/*
		 * Interlock trunc_off.  The VOP front-end may continue to
		 * make adjustments to it while we are blocked.
		 */
		off_t trunc_off;
		off_t aligned_trunc_off;
		int blkmask;

		trunc_off = ip->sync_trunc_off;
		blkmask = hammer_blocksize(trunc_off) - 1;
		aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
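		/*
		 * NOTE: The expression above rounds the truncation offset
		 * up to the next block boundary.  For example, assuming a
		 * 16K block (blkmask 0x3FFF), a trunc_off of 0x4001 yields
		 * an aligned_trunc_off of 0x8000, while an already-aligned
		 * 0x4000 is left unchanged.
		 */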
		/*
		 * Delete any whole blocks on-media.  The front-end has
		 * already cleaned out any partial block and made it
		 * pending.  The front-end may have updated trunc_off
		 * while we were blocked so we only use sync_trunc_off.
		 *
		 * This operation can blow out the buffer cache, EWOULDBLOCK
		 * means we were unable to complete the deletion.  The
		 * deletion will update sync_trunc_off in that case.
		 */
		error = hammer_ip_delete_range(&cursor, ip,
					       aligned_trunc_off,
					       0x7FFFFFFFFFFFFFFFLL, 2);
		if (error == EWOULDBLOCK) {
			ip->flags |= HAMMER_INODE_WOULDBLOCK;
			error = 0;
			goto defer_buffer_flush;
		}
		if (error)
			goto done;
		/*
		 * Clear the truncation flag on the backend after we have
		 * completed the deletions.  Backend data is now good again
		 * (including new records we are about to sync, below).
		 *
		 * Leave sync_trunc_off intact.  As we write additional
		 * records the backend will update sync_trunc_off.  This
		 * tells the backend whether it can skip the overwrite
		 * test.  This should work properly even when the backend
		 * writes full blocks where the truncation point straddles
		 * the block because the comparison is against the base
		 * offset of the record.
		 */
		ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
		/* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
	} else {
		error = 0;
	}
	/*
	 * Now sync related records.  These will typically be directory
	 * entries, records tracking direct-writes, or delete-on-disk records.
	 */
	if (error == 0) {
		tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
				    hammer_sync_record_callback, &cursor);
		if (tmp_error < 0)
			tmp_error = -tmp_error;
		if (tmp_error)
			error = tmp_error;
	}
	hammer_cache_node(&ip->cache[1], cursor.node);
	/*
	 * Re-seek for inode update, assuming our cache hasn't been ripped
	 * out from under us.
	 */
	if (error == 0) {
		tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
		if (tmp_node) {
			hammer_cursor_downgrade(&cursor);
			hammer_lock_sh(&tmp_node->lock);
			if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
				hammer_cursor_seek(&cursor, tmp_node, 0);
			hammer_unlock(&tmp_node->lock);
			hammer_rel_node(tmp_node);
		}
		error = 0;
	}
	/*
	 * If we are deleting the inode the frontend had better not have
	 * any active references on elements making up the inode.
	 *
	 * The call to hammer_ip_delete_clean() cleans up auxiliary records
	 * but not DB or DATA records.  Those must have already been deleted
	 * by the normal truncation mechanic.
	 */
	if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
	    RB_EMPTY(&ip->rec_tree) &&
	    (ip->sync_flags & HAMMER_INODE_DELETING) &&
	    (ip->flags & HAMMER_INODE_DELETED) == 0) {
		int count1 = 0;

		error = hammer_ip_delete_clean(&cursor, ip, &count1);
		if (error == 0) {
			ip->flags |= HAMMER_INODE_DELETED;
			ip->sync_flags &= ~HAMMER_INODE_DELETING;
			ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
			KKASSERT(RB_EMPTY(&ip->rec_tree));
			/*
			 * Set delete_tid in both the frontend and backend
			 * copy of the inode record.  The DELETED flag
			 * handles this, do not set DDIRTY.
			 */
			ip->ino_leaf.base.delete_tid = trans->tid;
			ip->sync_ino_leaf.base.delete_tid = trans->tid;
			ip->ino_leaf.delete_ts = trans->time32;
			ip->sync_ino_leaf.delete_ts = trans->time32;
			/*
			 * Adjust the inode count in the volume header.
			 */
			hammer_sync_lock_sh(trans);
			if (ip->flags & HAMMER_INODE_ONDISK) {
				hammer_modify_volume_field(trans,
							   trans->rootvol,
							   vol0_stat_inodes);
				--ip->hmp->rootvol->ondisk->vol0_stat_inodes;
				hammer_modify_volume_done(trans->rootvol);
			}
			hammer_sync_unlock(trans);
		}
	}
	ip->sync_flags &= ~HAMMER_INODE_BUFS;

	if (error)
		goto done;

defer_buffer_flush:
	/*
	 * Now update the inode's on-disk inode-data and/or on-disk record.
	 * DELETED and ONDISK are managed only in ip->flags.
	 *
	 * In the case of a deferred buffer flush we still update the on-disk
	 * inode to satisfy visibility requirements if there happen to be
	 * directory dependencies.
	 */
	switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
	case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
		/*
		 * If deleted and on-disk, don't set any additional flags.
		 * The delete flag takes care of things.
		 *
		 * Clear flags which may have been set by the frontend.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
				    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
				    HAMMER_INODE_DELETING);
		break;
	case HAMMER_INODE_DELETED:
		/*
		 * Take care of the case where a deleted inode was never
		 * flushed to the disk in the first place.
		 *
		 * Clear flags which may have been set by the frontend.
		 */
		ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
				    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
				    HAMMER_INODE_DELETING);
		while (RB_ROOT(&ip->rec_tree)) {
			hammer_record_t record = RB_ROOT(&ip->rec_tree);
			hammer_ref(&record->lock);
			KKASSERT(record->lock.refs == 1);
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
			hammer_rel_mem_record(record);
		}
		break;
	case HAMMER_INODE_ONDISK:
		/*
		 * If already on-disk, do not set any additional flags.
		 */
		break;
	default:
		/*
		 * If not on-disk and not deleted, set DDIRTY to force
		 * an initial record to be written.
		 *
		 * Also set the create_tid in both the frontend and backend
		 * copy of the inode record.
		 */
		ip->ino_leaf.base.create_tid = trans->tid;
		ip->ino_leaf.create_ts = trans->time32;
		ip->sync_ino_leaf.base.create_tid = trans->tid;
		ip->sync_ino_leaf.create_ts = trans->time32;
		ip->sync_flags |= HAMMER_INODE_DDIRTY;
		break;
	}
	/*
	 * If DDIRTY is set, write out a new record.  If the inode is
	 * already on-disk the old record is marked as deleted.
	 *
	 * If DELETED is set hammer_update_inode() will delete the existing
	 * record without writing out a new one.
	 *
	 * If *ONLY* the ITIMES flags are set we can update the record
	 * in-place.
	 */
	if (ip->flags & HAMMER_INODE_DELETED) {
		error = hammer_update_inode(&cursor, ip);
	} else
	if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
	    (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
		error = hammer_update_itimes(&cursor, ip);
	} else
	if (ip->sync_flags & (HAMMER_INODE_DDIRTY |
			      HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
		error = hammer_update_inode(&cursor, ip);
	}
done:
	if (error) {
		hammer_critical_error(ip->hmp, ip, error,
				      "while syncing inode");
	}
	hammer_done_cursor(&cursor);
	return(error);
}
/*
 * This routine is called when the OS is no longer actively referencing
 * the inode (but might still be keeping it cached), or when releasing
 * the last reference to an inode.
 *
 * At this point if the inode's nlinks count is zero we want to destroy
 * it, which may mean destroying it on-media too.
 */
void
hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
{
	struct vnode *vp;
	/*
	 * Set the DELETING flag when the link count drops to 0 and the
	 * OS no longer has any opens on the inode.
	 *
	 * The backend will clear DELETING (a mod flag) and set DELETED
	 * (a state flag) when it is actually able to perform the
	 * operation.
	 *
	 * Don't reflag the deletion if the flusher is currently syncing
	 * one that was already flagged.  A previously set DELETING flag
	 * may bounce around flags and sync_flags until the operation is
	 * completely done.
	 */
	if (ip->ino_data.nlinks == 0 &&
	    ((ip->flags | ip->sync_flags) &
	     (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
		ip->flags |= HAMMER_INODE_DELETING;
		ip->flags |= HAMMER_INODE_TRUNCATED;
		ip->trunc_off = 0;
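		/*
		 * NOTE: Flagging TRUNCATED with a zero trunc_off appears
		 * to queue a whole-file truncation, letting the backend
		 * reclaim the file's data blocks as part of the deletion.
		 */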
		vp = NULL;
		if (getvp) {
			if (hammer_get_vnode(ip, &vp) != 0)
				return;
		}

		/*
		 * Final cleanup
		 */
		if (ip->vp) {
			vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
			vnode_pager_setsize(ip->vp, 0);
		}
		if (getvp) {
			vput(vp);
		}
	}
}
/*
 * After potentially resolving a dependency the inode is tested
 * to determine whether it needs to be reflushed.
 */
void
hammer_test_inode(hammer_inode_t ip)
{
	if (ip->flags & HAMMER_INODE_REFLUSH) {
		ip->flags &= ~HAMMER_INODE_REFLUSH;
		hammer_ref(&ip->lock);
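		/*
		 * NOTE: The temporary reference presumably keeps the inode
		 * from being reclaimed out from under us while the flush
		 * is initiated; it is dropped again below.
		 */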
		if (ip->flags & HAMMER_INODE_RESIGNAL) {
			ip->flags &= ~HAMMER_INODE_RESIGNAL;
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		} else {
			hammer_flush_inode(ip, 0);
		}
		hammer_rel_inode(ip, 0);
	}
}
/*
 * Clear the RECLAIM flag on an inode.  This occurs when the inode is
 * reassociated with a vp or just before it gets freed.
 *
 * Pipeline wakeups to threads blocked due to an excessive number of
 * detached inodes.  The reclaim count generates a bit of negative
 * feedback.
 */
static void
hammer_inode_wakereclaims(hammer_inode_t ip, int dowake)
{
	struct hammer_reclaim *reclaim;
	hammer_mount_t hmp = ip->hmp;

	if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
		return;

	--hammer_count_reclaiming;
	--hmp->inode_reclaims;
	ip->flags &= ~HAMMER_INODE_RECLAIM;
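	/*
	 * NOTE: Each blocked thread queues its own hammer_reclaim
	 * structure with a positive count; the first waiter in line is
	 * woken only once its count drains to zero, pipelining the
	 * wakeups rather than waking every waiter at once.
	 */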
	if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT || dowake) {
		reclaim = TAILQ_FIRST(&hmp->reclaim_list);
		if (reclaim && reclaim->count > 0 && --reclaim->count == 0) {
			TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
			wakeup(reclaim);
		}
	}
}
/*
 * Setup our reclaim pipeline.  We only let so many detached (and dirty)
 * inodes build up before we start blocking.
 *
 * When we block we don't care *which* inode has finished reclaiming,
 * as long as one does.  This is somewhat heuristic... we also put a
 * cap on how long we are willing to wait.
 */
void
hammer_inode_waitreclaims(hammer_mount_t hmp)
{
	struct hammer_reclaim reclaim;
	int delay;

	if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
		return;
	delay = (hmp->inode_reclaims - HAMMER_RECLAIM_WAIT) * hz /
		(HAMMER_RECLAIM_WAIT * 3) + 1;
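	/*
	 * NOTE: The timeout scales linearly with the backlog.  For
	 * example, assuming HAMMER_RECLAIM_WAIT is 4000 and hz is 100,
	 * a backlog of 16000 reclaiming inodes yields
	 * (12000 * 100) / 12000 + 1 = 101 ticks, i.e. roughly one
	 * second, while a backlog just over the threshold sleeps for
	 * a single tick (unless woken earlier).
	 */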
	reclaim.count = 1;
	TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
	tsleep(&reclaim, 0, "hmrrcm", delay);
	if (reclaim.count > 0)
		TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
}
/*
 * A larger than normal backlog of inodes is sitting in the flusher,
 * enforce a general slowdown to let it catch up.  This routine is only
 * called on completion of a non-flusher-related transaction which
 * performed B-Tree node I/O.
 *
 * It is possible for the flusher to stall in a continuous load.
 * blogbench -i1000 -o seems to do a good job generating this sort of load.
 * If the flusher is unable to catch up the inode count can bloat until
 * we run out of kvm.
 *
 * This is a bit of a hack.
 */
void
hammer_inode_waithard(hammer_mount_t hmp)
{
	/*
	 * Hysteresis.
	 */
	if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
		if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
		    hmp->count_iqueued < hmp->count_inodes / 20) {
			hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
			return;
		}
	} else {
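		/*
		 * NOTE: Recovery mode is entered at HAMMER_RECLAIM_WAIT
		 * reclaims and 1/10th of the inodes queued, but exited
		 * only below half of HAMMER_RECLAIM_WAIT and 1/20th,
		 * presumably so the mount does not oscillate in and out
		 * of recovery mode under a steady load.
		 */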
		if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
		    hmp->count_iqueued < hmp->count_inodes / 10) {
			return;
		}
		hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
	}

	/*
	 * Block for one flush cycle.
	 */
	hammer_flusher_wait_next(hmp);
}