2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 static int hammer_mem_lookup(hammer_cursor_t cursor
);
38 static int hammer_mem_first(hammer_cursor_t cursor
);
39 static int hammer_frontend_trunc_callback(hammer_record_t record
,
41 static int hammer_bulk_scan_callback(hammer_record_t record
, void *data
);
42 static int hammer_record_needs_overwrite_delete(hammer_record_t record
);
43 static int hammer_delete_general(hammer_cursor_t cursor
, hammer_inode_t ip
,
44 hammer_btree_leaf_elm_t leaf
);
45 static int hammer_cursor_localize_data(hammer_mount_t hmp
, hammer_data_ondisk_t data
,
46 hammer_btree_leaf_elm_t leaf
);
48 struct rec_trunc_info
{
53 struct hammer_bulk_info
{
54 hammer_record_t record
;
55 hammer_record_t conflict
;
59 * Red-black tree support. Comparison code for insertion.
62 hammer_rec_rb_compare(hammer_record_t rec1
, hammer_record_t rec2
)
64 if (rec1
->leaf
.base
.rec_type
< rec2
->leaf
.base
.rec_type
)
66 if (rec1
->leaf
.base
.rec_type
> rec2
->leaf
.base
.rec_type
)
69 if (rec1
->leaf
.base
.key
< rec2
->leaf
.base
.key
)
71 if (rec1
->leaf
.base
.key
> rec2
->leaf
.base
.key
)
75 * For search & insertion purposes records deleted by the
76 * frontend or deleted/committed by the backend are silently
77 * ignored. Otherwise pipelined insertions will get messed
80 * rec1 is greater than rec2 if rec1 is marked deleted.
81 * rec1 is less than rec2 if rec2 is marked deleted.
83 * Multiple deleted records may be present, do not return 0
84 * if both are marked deleted.
86 if (rec1
->flags
& (HAMMER_RECF_DELETED_FE
| HAMMER_RECF_DELETED_BE
|
87 HAMMER_RECF_COMMITTED
)) {
90 if (rec2
->flags
& (HAMMER_RECF_DELETED_FE
| HAMMER_RECF_DELETED_BE
|
91 HAMMER_RECF_COMMITTED
)) {
99 * Basic record comparison code similar to hammer_btree_cmp().
101 * obj_id is not compared and may not yet be assigned in the record.
104 hammer_rec_cmp(hammer_base_elm_t elm
, hammer_record_t rec
)
106 if (elm
->rec_type
< rec
->leaf
.base
.rec_type
)
108 if (elm
->rec_type
> rec
->leaf
.base
.rec_type
)
111 if (elm
->key
< rec
->leaf
.base
.key
)
113 if (elm
->key
> rec
->leaf
.base
.key
)
117 * Never match against an item deleted by the frontend
118 * or backend, or committed by the backend.
120 * elm is less than rec if rec is marked deleted.
122 if (rec
->flags
& (HAMMER_RECF_DELETED_FE
| HAMMER_RECF_DELETED_BE
|
123 HAMMER_RECF_COMMITTED
)) {
130 * Ranged scan to locate overlapping record(s). This is used by
131 * hammer_ip_get_bulk() to locate an overlapping record. We have
132 * to use a ranged scan because the keys for data records with the
133 * same file base offset can be different due to differing data_len's.
135 * NOTE: The base file offset of a data record is (key - data_len), not (key).
138 hammer_rec_overlap_cmp(hammer_record_t rec
, void *data
)
140 struct hammer_bulk_info
*info
= data
;
141 hammer_btree_leaf_elm_t leaf
= &info
->record
->leaf
;
143 if (rec
->leaf
.base
.rec_type
< leaf
->base
.rec_type
)
145 if (rec
->leaf
.base
.rec_type
> leaf
->base
.rec_type
)
151 if (leaf
->base
.rec_type
== HAMMER_RECTYPE_DATA
) {
152 /* rec_beg >= leaf_end */
153 if (rec
->leaf
.base
.key
- rec
->leaf
.data_len
>= leaf
->base
.key
)
155 /* rec_end <= leaf_beg */
156 if (rec
->leaf
.base
.key
<= leaf
->base
.key
- leaf
->data_len
)
159 if (rec
->leaf
.base
.key
< leaf
->base
.key
)
161 if (rec
->leaf
.base
.key
> leaf
->base
.key
)
166 * We have to return 0 at this point, even if DELETED_FE is set,
167 * because returning anything else will cause the scan to ignore
168 * one of the branches when we really want it to check both.
174 * RB_SCAN comparison code for hammer_mem_first(). The argument order
175 * is reversed so the comparison result has to be negated. key_beg and
176 * key_end are both range-inclusive.
178 * Localized deletions are not cached in-memory.
182 hammer_rec_scan_cmp(hammer_record_t rec
, void *data
)
184 hammer_cursor_t cursor
= data
;
187 r
= hammer_rec_cmp(&cursor
->key_beg
, rec
);
190 r
= hammer_rec_cmp(&cursor
->key_end
, rec
);
197 * This compare function is used when simply looking up key_beg.
201 hammer_rec_find_cmp(hammer_record_t rec
, void *data
)
203 hammer_cursor_t cursor
= data
;
206 r
= hammer_rec_cmp(&cursor
->key_beg
, rec
);
215 * Locate blocks within the truncation range. Partial blocks do not count.
219 hammer_rec_trunc_cmp(hammer_record_t rec
, void *data
)
221 struct rec_trunc_info
*info
= data
;
223 if (rec
->leaf
.base
.rec_type
< info
->rec_type
)
225 if (rec
->leaf
.base
.rec_type
> info
->rec_type
)
228 switch(rec
->leaf
.base
.rec_type
) {
229 case HAMMER_RECTYPE_DB
:
231 * DB record key is not beyond the truncation point, retain.
233 if (rec
->leaf
.base
.key
< info
->trunc_off
)
236 case HAMMER_RECTYPE_DATA
:
238 * DATA record offset start is not beyond the truncation point,
241 if (rec
->leaf
.base
.key
- rec
->leaf
.data_len
< info
->trunc_off
)
245 hpanic("unexpected record type");
249 * The record start is >= the truncation point, return match,
250 * the record should be destroyed.
255 RB_GENERATE(hammer_rec_rb_tree
, hammer_record
, rb_node
, hammer_rec_rb_compare
);
258 * Allocate a record for the caller to finish filling in. The record is
259 * returned referenced. In order to manually set data call this function
260 * with data_len=0 and then manually set record->leaf.data_len and
261 * record->data later.
264 hammer_alloc_mem_record(hammer_inode_t ip
, int data_len
)
266 hammer_record_t record
;
270 ++hammer_count_records
;
271 record
= kmalloc(sizeof(*record
), hmp
->m_misc
,
272 M_WAITOK
| M_ZERO
| M_USE_RESERVE
);
273 record
->flush_state
= HAMMER_FST_IDLE
;
275 record
->leaf
.base
.btype
= HAMMER_BTREE_TYPE_RECORD
;
276 record
->leaf
.data_len
= data_len
;
277 hammer_ref(&record
->lock
);
280 record
->data
= kmalloc(data_len
, hmp
->m_misc
, M_WAITOK
| M_ZERO
);
281 record
->flags
|= HAMMER_RECF_ALLOCDATA
;
282 ++hammer_count_record_datas
;
289 hammer_wait_mem_record_ident(hammer_record_t record
, const char *ident
)
291 while (record
->flush_state
== HAMMER_FST_FLUSH
) {
292 record
->flags
|= HAMMER_RECF_WANTED
;
293 tsleep(record
, 0, ident
, 0);
298 * Called from the backend, hammer_inode.c, after a record has been
299 * flushed to disk. The record has been exclusively locked by the
300 * caller and interlocked with BE.
302 * We clean up the state, unlock, and release the record (the record
303 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
306 hammer_flush_record_done(hammer_record_t record
, int error
)
308 hammer_inode_t target_ip
;
310 KKASSERT(record
->flush_state
== HAMMER_FST_FLUSH
);
311 KKASSERT(record
->flags
& HAMMER_RECF_INTERLOCK_BE
);
314 * If an error occurred, the backend was unable to sync the
315 * record to its media. Leave the record intact.
318 hammer_critical_error(record
->ip
->hmp
, record
->ip
, error
,
319 "while flushing record");
322 --record
->flush_group
->refs
;
323 record
->flush_group
= NULL
;
326 * Adjust the flush state and dependency based on success or
329 if (record
->flags
& (HAMMER_RECF_DELETED_BE
| HAMMER_RECF_COMMITTED
)) {
330 if ((target_ip
= record
->target_ip
) != NULL
) {
331 TAILQ_REMOVE(&target_ip
->target_list
, record
,
333 record
->target_ip
= NULL
;
334 hammer_test_inode(target_ip
);
336 record
->flush_state
= HAMMER_FST_IDLE
;
338 if (record
->target_ip
) {
339 record
->flush_state
= HAMMER_FST_SETUP
;
340 hammer_test_inode(record
->ip
);
341 hammer_test_inode(record
->target_ip
);
343 record
->flush_state
= HAMMER_FST_IDLE
;
346 record
->flags
&= ~HAMMER_RECF_INTERLOCK_BE
;
351 if (record
->flags
& HAMMER_RECF_WANTED
) {
352 record
->flags
&= ~HAMMER_RECF_WANTED
;
355 hammer_rel_mem_record(record
);
359 * Release a memory record. Records marked for deletion are immediately
360 * removed from the RB-Tree but otherwise left intact until the last ref
364 hammer_rel_mem_record(hammer_record_t record
)
367 hammer_reserve_t resv
;
369 hammer_inode_t target_ip
;
372 hammer_rel(&record
->lock
);
374 if (hammer_norefs(&record
->lock
)) {
376 * Upon release of the last reference wakeup any waiters.
377 * The record structure may get destroyed so callers will
378 * loop up and do a relookup.
380 * WARNING! Record must be removed from RB-TREE before we
381 * might possibly block. hammer_test_inode() can block!
387 * Upon release of the last reference a record marked deleted
388 * by the front or backend, or committed by the backend,
391 if (record
->flags
& (HAMMER_RECF_DELETED_FE
|
392 HAMMER_RECF_DELETED_BE
|
393 HAMMER_RECF_COMMITTED
)) {
394 KKASSERT(hammer_isactive(&ip
->lock
) > 0);
395 KKASSERT(record
->flush_state
!= HAMMER_FST_FLUSH
);
398 * target_ip may have zero refs, we have to ref it
399 * to prevent it from being ripped out from under
402 if ((target_ip
= record
->target_ip
) != NULL
) {
403 TAILQ_REMOVE(&target_ip
->target_list
,
404 record
, target_entry
);
405 record
->target_ip
= NULL
;
406 hammer_ref(&target_ip
->lock
);
410 * Remove the record from the RB-Tree
412 if (record
->flags
& HAMMER_RECF_ONRBTREE
) {
413 RB_REMOVE(hammer_rec_rb_tree
,
416 record
->flags
&= ~HAMMER_RECF_ONRBTREE
;
417 KKASSERT(ip
->rsv_recs
> 0);
418 if (RB_EMPTY(&ip
->rec_tree
)) {
419 ip
->flags
&= ~HAMMER_INODE_XDIRTY
;
420 ip
->sync_flags
&= ~HAMMER_INODE_XDIRTY
;
428 * We must wait for any direct-IO to complete before
429 * we can destroy the record because the bio may
430 * have a reference to it.
433 (HAMMER_RECG_DIRECT_IO
| HAMMER_RECG_DIRECT_INVAL
)) {
434 hammer_io_direct_wait(record
);
438 * Account for the completion after the direct IO
444 hmp
->rsv_databytes
-= record
->leaf
.data_len
;
446 if (RB_EMPTY(&ip
->rec_tree
))
447 hammer_test_inode(ip
);
448 if ((ip
->flags
& HAMMER_INODE_RECSW
) &&
449 ip
->rsv_recs
<= hammer_limit_inode_recs
/2) {
450 ip
->flags
&= ~HAMMER_INODE_RECSW
;
451 wakeup(&ip
->rsv_recs
);
456 * Do this test after removing record from the RB-Tree.
459 hammer_test_inode(target_ip
);
460 hammer_rel_inode(target_ip
, 0);
463 if (record
->flags
& HAMMER_RECF_ALLOCDATA
) {
464 --hammer_count_record_datas
;
465 kfree(record
->data
, hmp
->m_misc
);
466 record
->flags
&= ~HAMMER_RECF_ALLOCDATA
;
470 * Release the reservation.
472 * If the record was not committed we can theoretically
473 * undo the reservation. However, doing so might
474 * create weird edge cases with the ordering of
475 * direct writes because the related buffer cache
476 * elements are per-vnode. So we don't try.
478 if ((resv
= record
->resv
) != NULL
) {
479 /* XXX undo leaf.data_offset,leaf.data_len */
480 hammer_blockmap_reserve_complete(hmp
, resv
);
484 --hammer_count_records
;
485 kfree(record
, hmp
->m_misc
);
491 * Record visibility depends on whether the record is being accessed by
492 * the backend or the frontend. Backend tests ignore the frontend delete
493 * flag. Frontend tests do NOT ignore the backend delete/commit flags and
494 * must also check for commit races.
496 * Return non-zero if the record is visible, zero if it isn't or if it is
497 * deleted. Returns 0 if the record has been committed (unless the special
498 * delete-visibility flag is set). A committed record must be located
499 * via the media B-Tree. Returns non-zero if the record is good.
501 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory
502 * records to be returned. This is so pending deletions are detected
503 * when using an iterator to locate an unused hash key, or when we need
504 * to locate historical records on-disk to destroy.
508 hammer_ip_iterate_mem_good(hammer_cursor_t cursor
, hammer_record_t record
)
510 if (cursor
->flags
& HAMMER_CURSOR_DELETE_VISIBILITY
)
512 if (cursor
->flags
& HAMMER_CURSOR_BACKEND
) {
513 if (record
->flags
& (HAMMER_RECF_DELETED_BE
|
514 HAMMER_RECF_COMMITTED
)) {
518 if (record
->flags
& (HAMMER_RECF_DELETED_FE
|
519 HAMMER_RECF_DELETED_BE
|
520 HAMMER_RECF_COMMITTED
)) {
528 * This callback is used as part of the RB_SCAN function for in-memory
529 * records. We terminate it (return -1) as soon as we get a match.
531 * This routine is used by frontend code.
533 * The primary compare code does not account for ASOF lookups. This
534 * code handles that case as well as a few others.
538 hammer_rec_scan_callback(hammer_record_t rec
, void *data
)
540 hammer_cursor_t cursor
= data
;
543 * We terminate on success, so this should be NULL on entry.
545 KKASSERT(cursor
->iprec
== NULL
);
548 * Skip if the record was marked deleted or committed.
550 if (hammer_ip_iterate_mem_good(cursor
, rec
) == 0)
554 * Skip if not visible due to our as-of TID
556 if (cursor
->flags
& HAMMER_CURSOR_ASOF
) {
557 if (cursor
->asof
< rec
->leaf
.base
.create_tid
)
559 if (rec
->leaf
.base
.delete_tid
&&
560 cursor
->asof
>= rec
->leaf
.base
.delete_tid
) {
566 * ref the record. The record is protected from backend B-Tree
567 * interactions by virtue of the cursor's IP lock.
569 hammer_ref(&rec
->lock
);
572 * The record may have been deleted or committed while we
573 * were blocked. XXX remove?
575 if (hammer_ip_iterate_mem_good(cursor
, rec
) == 0) {
576 hammer_rel_mem_record(rec
);
581 * Set the matching record and stop the scan.
589 * Lookup an in-memory record given the key specified in the cursor. Works
590 * just like hammer_btree_lookup() but operates on an inode's in-memory
593 * The lookup must fail if the record is marked for deferred deletion.
595 * The API for mem/btree_lookup() does not mess with the ATE/EOF bits.
599 hammer_mem_lookup(hammer_cursor_t cursor
)
601 KKASSERT(cursor
->ip
!= NULL
);
603 hammer_rel_mem_record(cursor
->iprec
);
604 cursor
->iprec
= NULL
;
606 hammer_rec_rb_tree_RB_SCAN(&cursor
->ip
->rec_tree
, hammer_rec_find_cmp
,
607 hammer_rec_scan_callback
, cursor
);
609 return (cursor
->iprec
? 0 : ENOENT
);
613 * hammer_mem_first() - locate the first in-memory record matching the
614 * cursor within the bounds of the key range.
616 * WARNING! API is slightly different from btree_first(). hammer_mem_first()
617 * will set ATEMEM the same as MEMEOF, and does not return any error.
621 hammer_mem_first(hammer_cursor_t cursor
)
623 KKASSERT(cursor
->ip
!= NULL
);
625 hammer_rel_mem_record(cursor
->iprec
);
626 cursor
->iprec
= NULL
;
628 hammer_rec_rb_tree_RB_SCAN(&cursor
->ip
->rec_tree
, hammer_rec_scan_cmp
,
629 hammer_rec_scan_callback
, cursor
);
632 cursor
->flags
&= ~(HAMMER_CURSOR_MEMEOF
| HAMMER_CURSOR_ATEMEM
);
634 cursor
->flags
|= HAMMER_CURSOR_MEMEOF
| HAMMER_CURSOR_ATEMEM
;
636 return (cursor
->iprec
? 0 : ENOENT
);
639 /************************************************************************
640 * HAMMER IN-MEMORY RECORD FUNCTIONS *
641 ************************************************************************
643 * These functions manipulate in-memory records. Such records typically
644 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
648 * Add a directory entry (dip,ncp) which references inode (ip).
650 * Note that the low 32 bits of the namekey are set temporarily to create
651 * a unique in-memory record, and may be modified a second time when the
652 * record is synchronized to disk. In particular, the low 32 bits cannot be
653 * all 0's when synching to disk, which is not handled here.
655 * NOTE: bytes does not include any terminating \0 on name, and name might
659 hammer_ip_add_direntry(hammer_transaction_t trans
,
660 hammer_inode_t dip
, const char *name
, int bytes
,
663 struct hammer_cursor cursor
;
664 hammer_record_t record
;
666 uint32_t max_iterations
;
668 KKASSERT(dip
->ino_data
.obj_type
== HAMMER_OBJTYPE_DIRECTORY
);
670 record
= hammer_alloc_mem_record(dip
, HAMMER_ENTRY_SIZE(bytes
));
672 record
->type
= HAMMER_MEM_RECORD_ADD
;
673 record
->leaf
.base
.localization
= dip
->obj_localization
|
674 hammer_dir_localization(dip
);
675 record
->leaf
.base
.obj_id
= dip
->obj_id
;
676 record
->leaf
.base
.key
= hammer_direntry_namekey(dip
, name
, bytes
,
678 record
->leaf
.base
.rec_type
= HAMMER_RECTYPE_DIRENTRY
;
679 record
->leaf
.base
.obj_type
= ip
->ino_leaf
.base
.obj_type
;
680 record
->data
->entry
.obj_id
= ip
->obj_id
;
681 record
->data
->entry
.localization
= ip
->obj_localization
;
682 bcopy(name
, record
->data
->entry
.name
, bytes
);
684 ++ip
->ino_data
.nlinks
;
685 ip
->ino_data
.ctime
= trans
->time
;
686 hammer_modify_inode(trans
, ip
, HAMMER_INODE_DDIRTY
);
689 * Find an unused namekey. Both the in-memory record tree and
690 * the B-Tree are checked. We do not want historically deleted
691 * names to create a collision as our iteration space may be limited,
692 * and since create_tid wouldn't match anyway an ASOF search
693 * must be used to locate collisions.
695 * delete-visibility is set so pending deletions do not give us
696 * a false-negative on our ability to use an iterator.
698 * The iterator must not rollover the key. Directory keys only
699 * use the positive key space.
701 hammer_init_cursor(trans
, &cursor
, &dip
->cache
[1], dip
);
702 cursor
.key_beg
= record
->leaf
.base
;
703 cursor
.flags
|= HAMMER_CURSOR_ASOF
;
704 cursor
.flags
|= HAMMER_CURSOR_DELETE_VISIBILITY
;
705 cursor
.asof
= ip
->obj_asof
;
707 while (hammer_ip_lookup(&cursor
) == 0) {
708 ++record
->leaf
.base
.key
;
709 KKASSERT(record
->leaf
.base
.key
> 0);
710 cursor
.key_beg
.key
= record
->leaf
.base
.key
;
711 if (--max_iterations
== 0) {
712 hammer_rel_mem_record(record
);
713 hmkprintf(trans
->hmp
, "Failed to find an unused namekey\n");
720 * The target inode and the directory entry are bound together.
722 record
->target_ip
= ip
;
723 record
->flush_state
= HAMMER_FST_SETUP
;
724 TAILQ_INSERT_TAIL(&ip
->target_list
, record
, target_entry
);
727 * The inode now has a dependency and must be taken out of the idle
728 * state. An inode not in an idle state is given an extra reference.
730 * When transitioning to a SETUP state flag for an automatic reflush
731 * when the dependencies are disposed of if someone is waiting on
734 if (ip
->flush_state
== HAMMER_FST_IDLE
) {
735 hammer_ref(&ip
->lock
);
736 ip
->flush_state
= HAMMER_FST_SETUP
;
737 if (ip
->flags
& HAMMER_INODE_FLUSHW
)
738 ip
->flags
|= HAMMER_INODE_REFLUSH
;
740 error
= hammer_mem_add(record
);
742 dip
->ino_data
.mtime
= trans
->time
;
743 dip
->ino_data
.ctime
= trans
->time
;
744 hammer_modify_inode(trans
, dip
, HAMMER_INODE_MTIME
| HAMMER_INODE_DDIRTY
);
747 hammer_done_cursor(&cursor
);
752 * Delete the directory entry and update the inode link count. The
753 * cursor must be seeked to the directory entry record being deleted.
755 * The related inode should be share-locked by the caller. The caller is
756 * on the frontend. It could also be NULL indicating that the directory
757 * entry being removed has no related inode.
759 * This function can return EDEADLK requiring the caller to terminate
760 * the cursor, any locks, wait on the returned record, and retry.
763 hammer_ip_del_direntry(hammer_transaction_t trans
,
764 hammer_cursor_t cursor
, hammer_inode_t dip
,
767 hammer_record_t record
;
770 if (hammer_cursor_inmem(cursor
)) {
772 * In-memory (unsynchronized) records can simply be freed.
774 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
775 * by the backend, we must still avoid races against the
776 * backend potentially syncing the record to the media.
778 * We cannot call hammer_ip_delete_record(), that routine may
779 * only be called from the backend.
781 record
= cursor
->iprec
;
782 if (record
->flags
& (HAMMER_RECF_INTERLOCK_BE
|
783 HAMMER_RECF_DELETED_BE
|
784 HAMMER_RECF_COMMITTED
)) {
785 KKASSERT(cursor
->deadlk_rec
== NULL
);
786 hammer_ref(&record
->lock
);
787 cursor
->deadlk_rec
= record
;
790 KKASSERT(record
->type
== HAMMER_MEM_RECORD_ADD
);
791 record
->flags
|= HAMMER_RECF_DELETED_FE
;
796 * If the record is on-disk we have to queue the deletion by
797 * the record's key. This also causes lookups to skip the
798 * record (lookups for the purposes of finding an unused
799 * directory key do not skip the record).
801 KKASSERT(dip
->flags
&
802 (HAMMER_INODE_ONDISK
| HAMMER_INODE_DONDISK
));
803 record
= hammer_alloc_mem_record(dip
, 0);
804 record
->type
= HAMMER_MEM_RECORD_DEL
;
805 record
->leaf
.base
= cursor
->leaf
->base
;
806 KKASSERT(dip
->obj_id
== record
->leaf
.base
.obj_id
);
809 * ip may be NULL, indicating the deletion of a directory
810 * entry which has no related inode.
812 record
->target_ip
= ip
;
814 record
->flush_state
= HAMMER_FST_SETUP
;
815 TAILQ_INSERT_TAIL(&ip
->target_list
, record
,
818 record
->flush_state
= HAMMER_FST_IDLE
;
822 * The inode now has a dependency and must be taken out of
823 * the idle state. An inode not in an idle state is given
824 * an extra reference.
826 * When transitioning to a SETUP state flag for an automatic
827 * reflush when the dependencies are disposed of if someone
828 * is waiting on the inode.
830 if (ip
&& ip
->flush_state
== HAMMER_FST_IDLE
) {
831 hammer_ref(&ip
->lock
);
832 ip
->flush_state
= HAMMER_FST_SETUP
;
833 if (ip
->flags
& HAMMER_INODE_FLUSHW
)
834 ip
->flags
|= HAMMER_INODE_REFLUSH
;
837 error
= hammer_mem_add(record
);
841 * One less link. The file may still be open in the OS even after
842 * all links have gone away.
844 * We have to terminate the cursor before syncing the inode to
845 * avoid deadlocking against ourselves. XXX this may no longer
848 * If nlinks drops to zero and the vnode is inactive (or there is
849 * no vnode), call hammer_inode_unloadable_check() to zonk the
850 * inode. If we don't do this here the inode will not be destroyed
851 * on-media until we unmount.
855 --ip
->ino_data
.nlinks
; /* do before we might block */
856 ip
->ino_data
.ctime
= trans
->time
;
858 dip
->ino_data
.mtime
= trans
->time
;
859 hammer_modify_inode(trans
, dip
, HAMMER_INODE_MTIME
);
861 hammer_modify_inode(trans
, ip
, HAMMER_INODE_DDIRTY
);
862 if (ip
->ino_data
.nlinks
== 0 &&
863 (ip
->vp
== NULL
|| (ip
->vp
->v_flag
& VINACTIVE
))) {
864 hammer_done_cursor(cursor
);
865 hammer_inode_unloadable_check(ip
, 1);
866 hammer_flush_inode(ip
, 0);
875 * Add a record to an inode.
877 * The caller must allocate the record with hammer_alloc_mem_record(ip,len) and
878 * initialize the following additional fields that are not initialized by these
881 * The related inode should be share-locked by the caller. The caller is
884 * record->leaf.base.key
885 * record->leaf.base.rec_type
886 * record->leaf.base.localization
889 hammer_ip_add_record(hammer_transaction_t trans
, hammer_record_t record
)
891 hammer_inode_t ip
= record
->ip
;
894 KKASSERT(record
->leaf
.base
.localization
!= 0);
895 record
->leaf
.base
.obj_id
= ip
->obj_id
;
896 record
->leaf
.base
.obj_type
= ip
->ino_leaf
.base
.obj_type
;
897 error
= hammer_mem_add(record
);
902 * Locate a pre-existing bulk record in memory. The caller wishes to
903 * replace the record with a new one. The existing record may have a
904 * different length (and thus a different key) so we have to use an
905 * overlap check function.
907 static hammer_record_t
908 hammer_ip_get_bulk(hammer_record_t record
)
910 struct hammer_bulk_info info
;
911 hammer_inode_t ip
= record
->ip
;
913 info
.record
= record
;
914 info
.conflict
= NULL
;
915 hammer_rec_rb_tree_RB_SCAN(&ip
->rec_tree
, hammer_rec_overlap_cmp
,
916 hammer_bulk_scan_callback
, &info
);
918 return(info
.conflict
); /* may be NULL */
922 * Take records vetted by overlap_cmp. The first non-deleted record
923 * (if any) stops the scan.
926 hammer_bulk_scan_callback(hammer_record_t record
, void *data
)
928 struct hammer_bulk_info
*info
= data
;
930 if (record
->flags
& (HAMMER_RECF_DELETED_FE
| HAMMER_RECF_DELETED_BE
|
931 HAMMER_RECF_COMMITTED
)) {
934 hammer_ref(&record
->lock
);
935 info
->conflict
= record
;
936 return(-1); /* stop scan */
940 * Reserve blockmap space placemarked with an in-memory record.
942 * This routine is called by the frontend in order to be able to directly
943 * flush a buffer cache buffer. The frontend has locked the related buffer
944 * cache buffers and we should be able to manipulate any overlapping
947 * The caller is responsible for adding the returned record and deleting
948 * the returned conflicting record (if any), typically by calling
949 * hammer_ip_replace_bulk() (via hammer_io_direct_write()).
952 hammer_ip_add_bulk(hammer_inode_t ip
, off_t file_offset
, void *data
, int bytes
,
955 hammer_record_t record
;
959 * Create a record to cover the direct write. The record cannot
960 * be added to the in-memory RB tree here as it might conflict
961 * with an existing memory record. See hammer_io_direct_write().
963 * The backend is responsible for finalizing the space reserved in
966 * XXX bytes not aligned, depend on the reservation code to
967 * align the reservation.
969 record
= hammer_alloc_mem_record(ip
, 0);
970 zone
= hammer_data_zone_index(bytes
);
971 record
->resv
= hammer_blockmap_reserve(ip
->hmp
, zone
, bytes
,
972 &record
->leaf
.data_offset
,
974 if (record
->resv
== NULL
) {
975 hdkprintf("reservation failed\n");
976 hammer_rel_mem_record(record
);
979 record
->type
= HAMMER_MEM_RECORD_DATA
;
980 record
->leaf
.base
.rec_type
= HAMMER_RECTYPE_DATA
;
981 record
->leaf
.base
.obj_type
= ip
->ino_leaf
.base
.obj_type
;
982 record
->leaf
.base
.obj_id
= ip
->obj_id
;
983 record
->leaf
.base
.key
= file_offset
+ bytes
;
984 record
->leaf
.base
.localization
= ip
->obj_localization
|
985 HAMMER_LOCALIZE_MISC
;
986 record
->leaf
.data_len
= bytes
;
987 hammer_crc_set_leaf(ip
->hmp
->version
, data
, &record
->leaf
);
988 KKASSERT(*errorp
== 0);
994 * Called by hammer_io_direct_write() prior to any possible completion
995 * of the BIO to emplace the memory record associated with the I/O and
996 * to replace any prior memory record which might still be active.
998 * Setting the FE deleted flag on the old record (if any) avoids any RB
999 * tree insertion conflict, among other things.
1001 * This has to be done prior to the caller completing any related buffer
1002 * cache I/O or a reinstantiation of the buffer may load data from the
1003 * old media location instead of the new media location. The holding
1004 * of the locked buffer cache buffer serves to interlock the record
1005 * replacement operation.
1008 hammer_ip_replace_bulk(hammer_mount_t hmp
, hammer_record_t record
)
1010 hammer_record_t conflict
;
1011 int error __debugvar
;
1013 while ((conflict
= hammer_ip_get_bulk(record
)) != NULL
) {
1014 if ((conflict
->flags
& HAMMER_RECF_INTERLOCK_BE
) == 0) {
1015 conflict
->flags
|= HAMMER_RECF_DELETED_FE
;
1018 conflict
->flags
|= HAMMER_RECF_WANTED
;
1019 tsleep(conflict
, 0, "hmrrc3", 0);
1020 hammer_rel_mem_record(conflict
);
1022 error
= hammer_mem_add(record
);
1024 hammer_rel_mem_record(conflict
);
1025 KKASSERT(error
== 0);
1029 * Frontend truncation code. Scan in-memory records only. On-disk records
1030 * and records in a flushing state are handled by the backend. The vnops
1031 * setattr code will handle the block containing the truncation point.
1033 * Partial blocks are not deleted.
1035 * This code is only called on regular files.
1038 hammer_ip_frontend_trunc(hammer_inode_t ip
, off_t file_size
)
1040 struct rec_trunc_info info
;
1042 switch(ip
->ino_data
.obj_type
) {
1043 case HAMMER_OBJTYPE_REGFILE
:
1044 info
.rec_type
= HAMMER_RECTYPE_DATA
;
1046 case HAMMER_OBJTYPE_DBFILE
:
1047 info
.rec_type
= HAMMER_RECTYPE_DB
;
1052 info
.trunc_off
= file_size
;
1053 hammer_rec_rb_tree_RB_SCAN(&ip
->rec_tree
, hammer_rec_trunc_cmp
,
1054 hammer_frontend_trunc_callback
, &info
);
1059 * Scan callback for frontend records to destroy during a truncation.
1060 * We must ensure that DELETED_FE is set on the record or the frontend
1061 * will get confused in future read() calls.
1063 * NOTE: DELETED_FE cannot be set while the record interlock (BE) is held.
1064 * In this rare case we must wait for the interlock to be cleared.
1066 * NOTE: This function is only called on regular files. There are further
1067 * restrictions to the setting of DELETED_FE on directory records
1068 * undergoing a flush due to sensitive inode link count calculations.
1071 hammer_frontend_trunc_callback(hammer_record_t record
, void *data __unused
)
1073 if (record
->flags
& HAMMER_RECF_DELETED_FE
)
1076 if (record
->flush_state
== HAMMER_FST_FLUSH
)
1079 hammer_ref(&record
->lock
);
1080 while (record
->flags
& HAMMER_RECF_INTERLOCK_BE
)
1081 hammer_wait_mem_record_ident(record
, "hmmtrr");
1082 record
->flags
|= HAMMER_RECF_DELETED_FE
;
1083 hammer_rel_mem_record(record
);
1088 * Return 1 if the caller must check for and delete existing records
1089 * before writing out a new data record.
1091 * Return 0 if the caller can just insert the record into the B-Tree without
1095 hammer_record_needs_overwrite_delete(hammer_record_t record
)
1097 hammer_inode_t ip
= record
->ip
;
1098 int64_t file_offset
;
1101 if (ip
->ino_data
.obj_type
== HAMMER_OBJTYPE_DBFILE
)
1102 file_offset
= record
->leaf
.base
.key
;
1104 file_offset
= record
->leaf
.base
.key
- record
->leaf
.data_len
;
1105 r
= (file_offset
< ip
->save_trunc_off
);
1106 if (ip
->ino_data
.obj_type
== HAMMER_OBJTYPE_DBFILE
) {
1107 if (ip
->save_trunc_off
<= record
->leaf
.base
.key
)
1108 ip
->save_trunc_off
= record
->leaf
.base
.key
+ 1;
1110 if (ip
->save_trunc_off
< record
->leaf
.base
.key
)
1111 ip
->save_trunc_off
= record
->leaf
.base
.key
;
1117 * Backend code. Sync a record to the media.
1120 hammer_ip_sync_record_cursor(hammer_cursor_t cursor
, hammer_record_t record
)
1122 hammer_transaction_t trans
= cursor
->trans
;
1123 hammer_mount_t hmp
= trans
->hmp
;
1124 int64_t file_offset
;
1130 KKASSERT(record
->flush_state
== HAMMER_FST_FLUSH
);
1131 KKASSERT(record
->flags
& HAMMER_RECF_INTERLOCK_BE
);
1132 KKASSERT(record
->leaf
.base
.localization
!= 0);
1135 * Any direct-write related to the record must complete before we
1136 * can sync the record to the on-disk media.
1138 if (record
->gflags
& (HAMMER_RECG_DIRECT_IO
| HAMMER_RECG_DIRECT_INVAL
))
1139 hammer_io_direct_wait(record
);
1142 * If this is a bulk-data record placemarker there may be an existing
1143 * record on-disk, indicating a data overwrite. If there is the
1144 * on-disk record must be deleted before we can insert our new record.
1146 * We've synthesized this record and do not know what the create_tid
1147 * on-disk is, nor how much data it represents.
1149 * Keep in mind that (key) for data records is (base_offset + len),
1150 * not (base_offset). Also, we only want to get rid of on-disk
1151 * records since we are trying to sync our in-memory record, call
1152 * hammer_ip_delete_range() with truncating set to 1 to make sure
1153 * it skips in-memory records.
1155 * It is ok for the lookup to return ENOENT.
1157 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
1158 * to call hammer_ip_delete_range() or not. This also means we must
1159 * update sync_trunc_off as we write.
1161 if (record
->type
== HAMMER_MEM_RECORD_DATA
&&
1162 hammer_record_needs_overwrite_delete(record
)) {
1163 file_offset
= record
->leaf
.base
.key
- record
->leaf
.data_len
;
1164 bytes
= HAMMER_BUFSIZE_DOALIGN(record
->leaf
.data_len
);
1165 KKASSERT((file_offset
& HAMMER_BUFMASK
) == 0);
1166 error
= hammer_ip_delete_range(
1168 file_offset
, file_offset
+ bytes
- 1,
1170 if (error
&& error
!= ENOENT
)
1175 * If this is a general record there may be an on-disk version
1176 * that must be deleted before we can insert the new record.
1178 if (record
->type
== HAMMER_MEM_RECORD_GENERAL
) {
1179 error
= hammer_delete_general(cursor
, record
->ip
, &record
->leaf
);
1180 if (error
&& error
!= ENOENT
)
1187 hammer_normalize_cursor(cursor
);
1188 cursor
->key_beg
= record
->leaf
.base
;
1189 cursor
->flags
&= ~HAMMER_CURSOR_INITMASK
;
1190 cursor
->flags
|= HAMMER_CURSOR_BACKEND
;
1191 cursor
->flags
&= ~HAMMER_CURSOR_INSERT
;
1194 * Records can wind up on-media before the inode itself is on-media.
1197 record
->ip
->flags
|= HAMMER_INODE_DONDISK
;
1200 * If we are deleting a directory entry an exact match must be
1203 if (record
->type
== HAMMER_MEM_RECORD_DEL
) {
1204 error
= hammer_btree_lookup(cursor
);
1206 KKASSERT(cursor
->iprec
== NULL
);
1207 error
= hammer_ip_delete_record(cursor
, record
->ip
,
1210 record
->flags
|= HAMMER_RECF_DELETED_BE
|
1211 HAMMER_RECF_COMMITTED
;
1212 ++record
->ip
->rec_generation
;
1221 * Issue a lookup to position the cursor and locate the insertion
1222 * point. The target key should not exist. If we are creating a
1223 * directory entry we may have to iterate the low 32 bits of the
1224 * key to find an unused key.
1226 hammer_sync_lock_sh(trans
);
1227 cursor
->flags
|= HAMMER_CURSOR_INSERT
;
1228 error
= hammer_btree_lookup(cursor
);
1229 if (hammer_debug_inode
)
1230 hdkprintf("DOINSERT LOOKUP %d\n", error
);
1232 hdkprintf("duplicate rec at (%016jx)\n",
1233 (intmax_t)record
->leaf
.base
.key
);
1234 if (hammer_debug_critical
)
1235 Debugger("duplicate record1");
1239 if (error
!= ENOENT
)
1243 * Allocate the record and data. The result buffers will be
1244 * marked as being modified and further calls to
1245 * hammer_modify_buffer() will result in unneeded UNDO records.
1247 * Support zero-fill records (data == NULL and data_len != 0)
1249 if (record
->type
== HAMMER_MEM_RECORD_DATA
) {
1251 * The data portion of a bulk-data record has already been
1252 * committed to disk, we need only adjust the layer2
1253 * statistics in the same transaction as our B-Tree insert.
1255 KKASSERT(record
->leaf
.data_offset
!= 0);
1256 error
= hammer_blockmap_finalize(trans
,
1258 record
->leaf
.data_offset
,
1259 record
->leaf
.data_len
);
1260 } else if (record
->data
&& record
->leaf
.data_len
) {
1262 * Wholly cached record, with data. Allocate the data.
1264 bdata
= hammer_alloc_data(trans
, record
->leaf
.data_len
,
1265 record
->leaf
.base
.rec_type
,
1266 &record
->leaf
.data_offset
,
1267 &cursor
->data_buffer
,
1271 hammer_crc_set_leaf(hmp
->version
, record
->data
, &record
->leaf
);
1272 hammer_modify_buffer_noundo(trans
, cursor
->data_buffer
);
1273 bcopy(record
->data
, bdata
, record
->leaf
.data_len
);
1274 hammer_modify_buffer_done(cursor
->data_buffer
);
1277 * Wholly cached record, without data.
1279 record
->leaf
.data_offset
= 0;
1280 record
->leaf
.data_crc
= 0;
1283 error
= hammer_btree_insert(cursor
, &record
->leaf
, &doprop
);
1284 if (hammer_debug_inode
&& error
) {
1285 hdkprintf("BTREE INSERT error %d @ %016jx:%d key %016jx\n",
1287 (intmax_t)cursor
->node
->node_offset
,
1289 (intmax_t)record
->leaf
.base
.key
);
1293 * Our record is on-disk and we normally mark the in-memory version
1294 * as having been committed (and not BE-deleted).
1296 * If the record represented a directory deletion but we had to
1297 * sync a valid directory entry to disk due to dependencies,
1298 * we must convert the record to a covering delete so the
1299 * frontend does not have visibility on the synced entry.
1301 * WARNING: cursor's leaf pointer may have changed after do_propagation
1306 hammer_btree_do_propagation(cursor
, &record
->leaf
);
1308 if (record
->flags
& HAMMER_RECF_CONVERT_DELETE
) {
1310 * Must convert deleted directory entry add
1311 * to a directory entry delete.
1313 KKASSERT(record
->type
== HAMMER_MEM_RECORD_ADD
);
1314 record
->flags
&= ~HAMMER_RECF_DELETED_FE
;
1315 record
->type
= HAMMER_MEM_RECORD_DEL
;
1316 KKASSERT(record
->ip
->obj_id
== record
->leaf
.base
.obj_id
);
1317 KKASSERT(record
->flush_state
== HAMMER_FST_FLUSH
);
1318 record
->flags
&= ~HAMMER_RECF_CONVERT_DELETE
;
1319 KKASSERT((record
->flags
& (HAMMER_RECF_COMMITTED
|
1320 HAMMER_RECF_DELETED_BE
)) == 0);
1321 /* converted record is not yet committed */
1322 /* hammer_flush_record_done takes care of the rest */
1325 * Everything went fine and we are now done with
1328 record
->flags
|= HAMMER_RECF_COMMITTED
;
1329 ++record
->ip
->rec_generation
;
1332 if (record
->leaf
.data_offset
) {
1333 hammer_blockmap_free(trans
, record
->leaf
.data_offset
,
1334 record
->leaf
.data_len
);
1338 hammer_sync_unlock(trans
);
1344 * Add the record to the inode's rec_tree. The low 32 bits of a directory
1345 * entry's key is used to deal with hash collisions in the upper 32 bits.
1346 * A unique 64 bit key is generated in-memory and may be regenerated a
1347 * second time when the directory record is flushed to the on-disk B-Tree.
1349 * A referenced record is passed to this function. This function
1350 * eats the reference. If an error occurs the record will be deleted.
1352 * A copy of the temporary record->data pointer provided by the caller
1356 hammer_mem_add(hammer_record_t record
)
1358 hammer_mount_t hmp
= record
->ip
->hmp
;
1361 * Make a private copy of record->data
1364 KKASSERT(record
->flags
& HAMMER_RECF_ALLOCDATA
);
1367 * Insert into the RB tree. A unique key should have already
1368 * been selected if this is a directory entry.
1370 if (RB_INSERT(hammer_rec_rb_tree
, &record
->ip
->rec_tree
, record
)) {
1371 record
->flags
|= HAMMER_RECF_DELETED_FE
;
1372 hammer_rel_mem_record(record
);
1376 ++record
->ip
->rsv_recs
;
1377 record
->ip
->hmp
->rsv_databytes
+= record
->leaf
.data_len
;
1378 record
->flags
|= HAMMER_RECF_ONRBTREE
;
1379 hammer_modify_inode(NULL
, record
->ip
, HAMMER_INODE_XDIRTY
);
1380 hammer_rel_mem_record(record
);
1384 /************************************************************************
1385 * HAMMER INODE MERGED-RECORD FUNCTIONS *
1386 ************************************************************************
1388 * These functions augment the B-Tree scanning functions in hammer_btree.c
1389 * by merging in-memory records with on-disk records.
1393 * Locate a particular record either in-memory or on-disk.
1395 * NOTE: This is basically a standalone routine, hammer_ip_next() may
1396 * NOT be called to iterate results.
1399 hammer_ip_lookup(hammer_cursor_t cursor
)
1404 * If the element is in-memory return it without searching the
1407 KKASSERT(cursor
->ip
);
1408 error
= hammer_mem_lookup(cursor
);
1410 cursor
->leaf
= &cursor
->iprec
->leaf
;
1413 if (error
!= ENOENT
)
1417 * If the inode has on-disk components search the on-disk B-Tree.
1419 if ((cursor
->ip
->flags
& (HAMMER_INODE_ONDISK
|HAMMER_INODE_DONDISK
)) == 0)
1421 error
= hammer_btree_lookup(cursor
);
1423 error
= hammer_btree_extract_leaf(cursor
);
1428 * Helper for hammer_ip_first()/hammer_ip_next()
1430 * NOTE: Both ATEDISK and DISKEOF will be set the same. This sets up
1431 * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek
1432 * state if hammer_ip_next() needs to re-seek.
1436 _hammer_ip_seek_btree(hammer_cursor_t cursor
)
1438 hammer_inode_t ip
= cursor
->ip
;
1441 if (ip
->flags
& (HAMMER_INODE_ONDISK
|HAMMER_INODE_DONDISK
)) {
1442 error
= hammer_btree_lookup(cursor
);
1443 if (error
== ENOENT
|| error
== EDEADLK
) {
1444 if (hammer_debug_general
& 0x2000) {
1445 hdkprintf("error %d node %p %016jx index %d\n",
1446 error
, cursor
->node
,
1447 (intmax_t)cursor
->node
->node_offset
,
1450 cursor
->flags
&= ~HAMMER_CURSOR_ATEDISK
;
1451 error
= hammer_btree_iterate(cursor
);
1454 cursor
->flags
&= ~(HAMMER_CURSOR_DISKEOF
|
1455 HAMMER_CURSOR_ATEDISK
);
1457 cursor
->flags
|= HAMMER_CURSOR_DISKEOF
|
1458 HAMMER_CURSOR_ATEDISK
;
1459 if (error
== ENOENT
)
1463 cursor
->flags
|= HAMMER_CURSOR_DISKEOF
| HAMMER_CURSOR_ATEDISK
;
1470 * Helper for hammer_ip_next()
1472 * The caller has determined that the media cursor is further along than the
1473 * memory cursor and must be reseeked after a generation number change.
1477 _hammer_ip_reseek(hammer_cursor_t cursor
)
1479 struct hammer_base_elm save
;
1480 hammer_btree_elm_t elm
;
1481 int error __debugvar
;
1488 hkprintf("Debug: re-seeked during scan @ino=%016jx\n",
1489 (intmax_t)cursor
->ip
->obj_id
);
1490 save
= cursor
->key_beg
;
1491 cursor
->key_beg
= cursor
->iprec
->leaf
.base
;
1492 error
= _hammer_ip_seek_btree(cursor
);
1493 KKASSERT(error
== 0);
1494 cursor
->key_beg
= save
;
1497 * If the memory record was previously returned to
1498 * the caller and the media record matches
1499 * (-1/+1: only create_tid differs), then iterate
1500 * the media record to avoid a double result.
1502 if ((cursor
->flags
& HAMMER_CURSOR_ATEDISK
) == 0 &&
1503 (cursor
->flags
& HAMMER_CURSOR_LASTWASMEM
)) {
1504 elm
= &cursor
->node
->ondisk
->elms
[cursor
->index
];
1505 r
= hammer_btree_cmp(&elm
->base
, &cursor
->iprec
->leaf
.base
);
1506 if (cursor
->flags
& HAMMER_CURSOR_ASOF
) {
1507 if (r
>= -1 && r
<= 1) {
1508 hkprintf("Debug: iterated after "
1509 "re-seek (asof r=%d)\n", r
);
1510 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1515 hkprintf("Debug: iterated after "
1517 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1526 * Locate the first record within the cursor's key_beg/key_end range,
1527 * restricted to a particular inode. 0 is returned on success, ENOENT
1528 * if no records matched the requested range, or some other error.
1530 * When 0 is returned hammer_ip_next() may be used to iterate additional
1531 * records within the requested range.
1533 * This function can return EDEADLK, requiring the caller to terminate
1534 * the cursor and try again.
1538 hammer_ip_first(hammer_cursor_t cursor
)
1540 hammer_inode_t ip __debugvar
= cursor
->ip
;
1543 KKASSERT(ip
!= NULL
);
1546 * Clean up fields and setup for merged scan
1548 cursor
->flags
&= ~HAMMER_CURSOR_RETEST
;
1551 * Search the in-memory record list (Red-Black tree). Unlike the
1552 * B-Tree search, mem_first checks for records in the range.
1554 * This function will setup both ATEMEM and MEMEOF properly for
1555 * the ip iteration. ATEMEM will be set if MEMEOF is set.
1557 hammer_mem_first(cursor
);
1560 * Detect generation changes during blockages, including
1561 * blockages which occur on the initial btree search.
1563 cursor
->rec_generation
= cursor
->ip
->rec_generation
;
1566 * Initial search and result
1568 error
= _hammer_ip_seek_btree(cursor
);
1570 error
= hammer_ip_next(cursor
);
1576 * Retrieve the next record in a merged iteration within the bounds of the
1577 * cursor. This call may be made multiple times after the cursor has been
1578 * initially searched with hammer_ip_first().
1580 * There are numerous special cases in this code to deal with races between
1581 * in-memory records and on-media records.
1583 * 0 is returned on success, ENOENT if no further records match the
1584 * requested range, or some other error code is returned.
1587 hammer_ip_next(hammer_cursor_t cursor
)
1589 hammer_btree_elm_t elm
;
1590 hammer_record_t rec
;
1591 hammer_record_t tmprec
;
1597 * Get the next on-disk record
1599 * NOTE: If we deleted the last on-disk record we had scanned
1600 * ATEDISK will be clear and RETEST will be set, forcing
1601 * a call to iterate. The fact that ATEDISK is clear causes
1602 * iterate to re-test the 'current' element. If ATEDISK is
1603 * set, iterate will skip the 'current' element.
1606 if ((cursor
->flags
& HAMMER_CURSOR_DISKEOF
) == 0) {
1607 if (cursor
->flags
& (HAMMER_CURSOR_ATEDISK
|
1608 HAMMER_CURSOR_RETEST
)) {
1609 error
= hammer_btree_iterate(cursor
);
1610 cursor
->flags
&= ~HAMMER_CURSOR_RETEST
;
1612 cursor
->flags
&= ~HAMMER_CURSOR_ATEDISK
;
1613 hammer_cache_node(&cursor
->ip
->cache
[1],
1615 } else if (error
== ENOENT
) {
1616 cursor
->flags
|= HAMMER_CURSOR_DISKEOF
|
1617 HAMMER_CURSOR_ATEDISK
;
1624 * If the generation changed the backend has deleted or committed
1625 * one or more memory records since our last check.
1627 * When this case occurs if the disk cursor is > current memory record
1628 * or the disk cursor is at EOF, we must re-seek the disk-cursor.
1629 * Since the cursor is ahead it must have not yet been eaten (if
1630 * not at eof anyway). (XXX data offset case?)
1632 * NOTE: we are not doing a full check here. That will be handled
1635 * If we have exhausted all memory records we do not have to do any
1638 while (cursor
->rec_generation
!= cursor
->ip
->rec_generation
&&
1640 hkprintf("Debug: generation changed during scan @ino=%016jx\n",
1641 (intmax_t)cursor
->ip
->obj_id
);
1642 cursor
->rec_generation
= cursor
->ip
->rec_generation
;
1643 if (cursor
->flags
& HAMMER_CURSOR_MEMEOF
)
1645 if (cursor
->flags
& HAMMER_CURSOR_DISKEOF
) {
1648 KKASSERT((cursor
->flags
& HAMMER_CURSOR_ATEDISK
) == 0);
1649 elm
= &cursor
->node
->ondisk
->elms
[cursor
->index
];
1650 r
= hammer_btree_cmp(&elm
->base
,
1651 &cursor
->iprec
->leaf
.base
);
1655 * Do we re-seek the media cursor?
1658 if (_hammer_ip_reseek(cursor
))
1664 * We can now safely get the next in-memory record. We cannot
1667 * hammer_rec_scan_cmp: Is the record still in our general range,
1668 * (non-inclusive of snapshot exclusions)?
1669 * hammer_rec_scan_callback: Is the record in our snapshot?
1672 if ((cursor
->flags
& HAMMER_CURSOR_MEMEOF
) == 0) {
1674 * If the current memory record was eaten then get the next
1675 * one. Stale records are skipped.
1677 if (cursor
->flags
& HAMMER_CURSOR_ATEMEM
) {
1678 tmprec
= cursor
->iprec
;
1679 cursor
->iprec
= NULL
;
1680 rec
= hammer_rec_rb_tree_RB_NEXT(tmprec
);
1682 if (hammer_rec_scan_cmp(rec
, cursor
) != 0)
1684 if (hammer_rec_scan_callback(rec
, cursor
) != 0)
1686 rec
= hammer_rec_rb_tree_RB_NEXT(rec
);
1688 if (cursor
->iprec
) {
1689 KKASSERT(cursor
->iprec
== rec
);
1690 cursor
->flags
&= ~HAMMER_CURSOR_ATEMEM
;
1692 cursor
->flags
|= HAMMER_CURSOR_MEMEOF
;
1694 cursor
->flags
&= ~HAMMER_CURSOR_LASTWASMEM
;
1699 * MEMORY RECORD VALIDITY TEST
1701 * (We still can't block, which is why tmprec is being held so
1704 * If the memory record is no longer valid we skip it. It may
1705 * have been deleted by the frontend. If it was deleted or
1706 * committed by the backend the generation change re-seeked the
1707 * disk cursor and the record will be present there.
1709 if (error
== 0 && (cursor
->flags
& HAMMER_CURSOR_MEMEOF
) == 0) {
1710 KKASSERT(cursor
->iprec
);
1711 KKASSERT((cursor
->flags
& HAMMER_CURSOR_ATEMEM
) == 0);
1712 if (!hammer_ip_iterate_mem_good(cursor
, cursor
->iprec
)) {
1713 cursor
->flags
|= HAMMER_CURSOR_ATEMEM
;
1715 hammer_rel_mem_record(tmprec
);
1720 hammer_rel_mem_record(tmprec
);
1723 * Extract either the disk or memory record depending on their
1724 * relative position.
1727 switch(cursor
->flags
& (HAMMER_CURSOR_ATEDISK
| HAMMER_CURSOR_ATEMEM
)) {
1730 * Both entries valid. Compare the entries and nominally
1731 * return the first one in the sort order. Numerous cases
1732 * require special attention, however.
1734 elm
= &cursor
->node
->ondisk
->elms
[cursor
->index
];
1735 r
= hammer_btree_cmp(&elm
->base
, &cursor
->iprec
->leaf
.base
);
1738 * If the two entries differ only by their key (-2/2) or
1739 * create_tid (-1/1), and are DATA records, we may have a
1740 * nominal match. We have to calculate the base file
1741 * offset of the data.
1743 if (r
<= 2 && r
>= -2 && r
!= 0 &&
1744 cursor
->ip
->ino_data
.obj_type
== HAMMER_OBJTYPE_REGFILE
&&
1745 cursor
->iprec
->type
== HAMMER_MEM_RECORD_DATA
) {
1746 int64_t base1
= elm
->leaf
.base
.key
- elm
->leaf
.data_len
;
1747 int64_t base2
= cursor
->iprec
->leaf
.base
.key
-
1748 cursor
->iprec
->leaf
.data_len
;
1754 error
= hammer_btree_extract_leaf(cursor
);
1755 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1756 cursor
->flags
&= ~HAMMER_CURSOR_LASTWASMEM
;
1761 * If the entries match exactly the memory entry is either
1762 * an on-disk directory entry deletion or a bulk data
1763 * overwrite. If it is a directory entry deletion we eat
1766 * For the bulk-data overwrite case it is possible to have
1767 * visibility into both, which simply means the syncer
1768 * hasn't gotten around to doing the delete+insert sequence
1769 * on the B-Tree. Use the memory entry and throw away the
1772 * If the in-memory record is not either of these we
1773 * probably caught the syncer while it was syncing it to
1774 * the media. Since we hold a shared lock on the cursor,
1775 * the in-memory record had better be marked deleted at
1779 if (cursor
->iprec
->type
== HAMMER_MEM_RECORD_DEL
) {
1780 if ((cursor
->flags
& HAMMER_CURSOR_DELETE_VISIBILITY
) == 0) {
1781 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1782 cursor
->flags
|= HAMMER_CURSOR_ATEMEM
;
1785 } else if (cursor
->iprec
->type
== HAMMER_MEM_RECORD_DATA
) {
1786 if ((cursor
->flags
& HAMMER_CURSOR_DELETE_VISIBILITY
) == 0) {
1787 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1789 /* fall through to memory entry */
1791 hpanic("duplicate mem/B-Tree entry %p %d %08x",
1793 cursor
->iprec
->type
,
1794 cursor
->iprec
->flags
);
1795 cursor
->flags
|= HAMMER_CURSOR_ATEMEM
;
1799 /* fall through to the memory entry */
1800 case HAMMER_CURSOR_ATEDISK
:
1802 * Only the memory entry is valid.
1804 cursor
->leaf
= &cursor
->iprec
->leaf
;
1805 cursor
->flags
|= HAMMER_CURSOR_ATEMEM
;
1806 cursor
->flags
|= HAMMER_CURSOR_LASTWASMEM
;
1809 * If the memory entry is an on-disk deletion we should have
1810 * also had found a B-Tree record. If the backend beat us
1811 * to it it would have interlocked the cursor and we should
1812 * have seen the in-memory record marked DELETED_FE.
1814 if (cursor
->iprec
->type
== HAMMER_MEM_RECORD_DEL
&&
1815 (cursor
->flags
& HAMMER_CURSOR_DELETE_VISIBILITY
) == 0) {
1816 hpanic("del-on-disk with no B-Tree entry iprec %p flags %08x",
1818 cursor
->iprec
->flags
);
1821 case HAMMER_CURSOR_ATEMEM
:
1823 * Only the disk entry is valid
1825 error
= hammer_btree_extract_leaf(cursor
);
1826 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
1827 cursor
->flags
&= ~HAMMER_CURSOR_LASTWASMEM
;
1831 * Neither entry is valid
1833 * XXX error not set properly
1835 cursor
->flags
&= ~HAMMER_CURSOR_LASTWASMEM
;
1836 cursor
->leaf
= NULL
;
1844 * Resolve the cursor->data pointer for the current cursor position in
1845 * a merged iteration.
1848 hammer_ip_resolve_data(hammer_cursor_t cursor
)
1850 hammer_record_t record
;
1853 if (hammer_cursor_inmem(cursor
)) {
1855 * The data associated with an in-memory record is usually
1856 * kmalloced, but reserve-ahead data records will have an
1857 * on-disk reference.
1859 * NOTE: Reserve-ahead data records must be handled in the
1860 * context of the related high level buffer cache buffer
1861 * to interlock against async writes.
1863 * NOTE: We might catch a direct write in-progress, in which
1864 * case we must wait for it to complete. The wait
1865 * function will also clean out any buffer aliases.
1867 * (In fact, it is possible that the write had not
1868 * even started yet).
1870 record
= cursor
->iprec
;
1871 cursor
->data
= record
->data
;
1873 if (cursor
->data
== NULL
) {
1874 hammer_io_direct_wait(record
);
1875 KKASSERT(record
->leaf
.base
.rec_type
==
1876 HAMMER_RECTYPE_DATA
);
1877 cursor
->data
= hammer_bread_ext(cursor
->trans
->hmp
,
1878 record
->leaf
.data_offset
,
1879 record
->leaf
.data_len
,
1881 &cursor
->data_buffer
);
1885 * Loading leaf here isn't necessary if it's guaranteed that
1886 * the cursor is at a leaf node (which basically should be)
1887 * because hammer_btree_extract_data() does that.
1889 cursor
->leaf
= &cursor
->node
->ondisk
->elms
[cursor
->index
].leaf
;
1890 error
= hammer_btree_extract_data(cursor
);
1896 * Backend truncation / record replacement - delete records in range.
1898 * Delete all records within the specified range for inode ip. In-memory
1899 * records still associated with the frontend are ignored.
1901 * If truncating is non-zero in-memory records associated with the back-end
1902 * are ignored. If truncating is > 1 we can return EWOULDBLOCK.
1906 * * An unaligned range will cause new records to be added to cover
1907 * the edge cases. (XXX not implemented yet).
1909 * * Replacement via reservations (see hammer_ip_sync_record_cursor())
1910 * also does not deal with unaligned ranges.
1912 * * ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1914 * * Record keys for regular file data have to be special-cased since
1915 * they indicate the end of the range (key = base + bytes).
1917 * * This function may be asked to delete ridiculously huge ranges, for
1918 * example if someone truncates or removes a 1TB regular file. We
1919 * must be very careful on restarts and we may have to stop w/
1920 * EWOULDBLOCK to avoid blowing out the buffer cache.
1923 hammer_ip_delete_range(hammer_cursor_t cursor
, hammer_inode_t ip
,
1924 int64_t ran_beg
, int64_t ran_end
, int truncating
)
1926 hammer_transaction_t trans
= cursor
->trans
;
1927 hammer_btree_leaf_elm_t leaf
;
1932 KKASSERT(trans
->type
== HAMMER_TRANS_FLS
);
1934 hammer_normalize_cursor(cursor
);
1935 cursor
->key_beg
.localization
= ip
->obj_localization
|
1936 HAMMER_LOCALIZE_MISC
;
1937 cursor
->key_beg
.obj_id
= ip
->obj_id
;
1938 cursor
->key_beg
.create_tid
= 0;
1939 cursor
->key_beg
.delete_tid
= 0;
1940 cursor
->key_beg
.obj_type
= 0;
1942 if (ip
->ino_data
.obj_type
== HAMMER_OBJTYPE_DBFILE
) {
1943 cursor
->key_beg
.key
= ran_beg
;
1944 cursor
->key_beg
.rec_type
= HAMMER_RECTYPE_DB
;
1947 * The key in the B-Tree is (base+bytes), so the first possible
1948 * matching key is ran_beg + 1.
1950 cursor
->key_beg
.key
= ran_beg
+ 1;
1951 cursor
->key_beg
.rec_type
= HAMMER_RECTYPE_DATA
;
1954 cursor
->key_end
= cursor
->key_beg
;
1955 if (ip
->ino_data
.obj_type
== HAMMER_OBJTYPE_DBFILE
) {
1956 cursor
->key_end
.key
= ran_end
;
1958 tmp64
= ran_end
+ MAXPHYS
+ 1; /* work around GCC-4 bug */
1959 if (tmp64
< ran_end
)
1960 cursor
->key_end
.key
= HAMMER_MAX_KEY
;
1962 cursor
->key_end
.key
= ran_end
+ MAXPHYS
+ 1;
1965 cursor
->asof
= ip
->obj_asof
;
1966 cursor
->flags
&= ~HAMMER_CURSOR_INITMASK
;
1967 cursor
->flags
|= HAMMER_CURSOR_ASOF
;
1968 cursor
->flags
|= HAMMER_CURSOR_DELETE_VISIBILITY
;
1969 cursor
->flags
|= HAMMER_CURSOR_BACKEND
;
1970 cursor
->flags
|= HAMMER_CURSOR_END_INCLUSIVE
;
1972 error
= hammer_ip_first(cursor
);
1975 * Iterate through matching records and mark them as deleted.
1977 while (error
== 0) {
1978 leaf
= cursor
->leaf
;
1980 KKASSERT(leaf
->base
.delete_tid
== 0);
1981 KKASSERT(leaf
->base
.obj_id
== ip
->obj_id
);
1984 * There may be overlap cases for regular file data. Also
1985 * remember the key for a regular file record is (base + len),
1988 * Note that due to duplicates (mem & media) allowed by
1989 * DELETE_VISIBILITY, off can wind up less than ran_beg.
1991 if (leaf
->base
.rec_type
== HAMMER_RECTYPE_DATA
) {
1992 off
= leaf
->base
.key
- leaf
->data_len
;
1994 * Check the left edge case. We currently do not
1995 * split existing records.
1997 if (off
< ran_beg
&& leaf
->base
.key
> ran_beg
) {
1998 hpanic("hammer left edge case %016jx %d",
1999 (intmax_t)leaf
->base
.key
,
2004 * Check the right edge case. Note that the
2005 * record can be completely out of bounds, which
2006 * terminates the search.
2008 * base->key is exclusive of the right edge while
2009 * ran_end is inclusive of the right edge. The
2010 * (key - data_len) left boundary is inclusive.
2012 * XXX theory-check this test at some point, are
2013 * we missing a + 1 somewhere? Note that ran_end
2016 if (leaf
->base
.key
- 1 > ran_end
) {
2017 if (leaf
->base
.key
- leaf
->data_len
> ran_end
)
2019 hpanic("hammer right edge case");
2022 off
= leaf
->base
.key
;
2026 * Delete the record. When truncating we do not delete
2027 * in-memory (data) records because they represent data
2028 * written after the truncation.
2030 * This will also physically destroy the B-Tree entry and
2031 * data if the retention policy dictates. The function
2032 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
2033 * to retest the new 'current' element.
2035 if (truncating
== 0 || hammer_cursor_ondisk(cursor
)) {
2036 error
= hammer_ip_delete_record(cursor
, ip
, trans
->tid
);
2038 * If we have built up too many meta-buffers we risk
2039 * deadlocking the kernel and must stop. This can
2040 * occur when deleting ridiculously huge files.
2041 * sync_trunc_off is updated so the next cycle does
2042 * not re-iterate records we have already deleted.
2044 * This is only done with formal truncations.
2046 if (truncating
> 1 && error
== 0 &&
2047 hammer_flusher_meta_limit(ip
->hmp
)) {
2048 ip
->sync_trunc_off
= off
;
2049 error
= EWOULDBLOCK
;
2054 ran_beg
= off
; /* for restart */
2055 error
= hammer_ip_next(cursor
);
2058 hammer_cache_node(&ip
->cache
[1], cursor
->node
);
2060 if (error
== EDEADLK
) {
2061 hammer_done_cursor(cursor
);
2062 error
= hammer_init_cursor(trans
, cursor
, &ip
->cache
[1], ip
);
2066 if (error
== ENOENT
)
2072 * This backend function deletes the specified record on-disk, similar to
2073 * delete_range but for a specific record. Unlike the exact deletions
2074 * used when deleting a directory entry this function uses an ASOF search
2075 * like delete_range.
2077 * This function may be called with ip->obj_asof set for a slave snapshot,
2078 * so don't use it. We always delete non-historical records only.
2081 hammer_delete_general(hammer_cursor_t cursor
, hammer_inode_t ip
,
2082 hammer_btree_leaf_elm_t leaf
)
2084 hammer_transaction_t trans
= cursor
->trans
;
2087 KKASSERT(trans
->type
== HAMMER_TRANS_FLS
);
2089 hammer_normalize_cursor(cursor
);
2090 cursor
->key_beg
= leaf
->base
;
2091 cursor
->asof
= HAMMER_MAX_TID
;
2092 cursor
->flags
&= ~HAMMER_CURSOR_INITMASK
;
2093 cursor
->flags
|= HAMMER_CURSOR_ASOF
;
2094 cursor
->flags
|= HAMMER_CURSOR_BACKEND
;
2095 cursor
->flags
&= ~HAMMER_CURSOR_INSERT
;
2097 error
= hammer_btree_lookup(cursor
);
2099 error
= hammer_ip_delete_record(cursor
, ip
, trans
->tid
);
2101 if (error
== EDEADLK
) {
2102 hammer_done_cursor(cursor
);
2103 error
= hammer_init_cursor(trans
, cursor
, &ip
->cache
[1], ip
);
2111 * This function deletes remaining auxiliary records when an inode is
2112 * being deleted. This function explicitly does not delete the
2113 * inode record, directory entry, data, or db records. Those must be
2114 * properly disposed of prior to this call.
2117 hammer_ip_delete_clean(hammer_cursor_t cursor
, hammer_inode_t ip
, int *countp
)
2119 hammer_transaction_t trans
= cursor
->trans
;
2120 hammer_btree_leaf_elm_t leaf __debugvar
;
2123 KKASSERT(trans
->type
== HAMMER_TRANS_FLS
);
2125 hammer_normalize_cursor(cursor
);
2126 cursor
->key_beg
.localization
= ip
->obj_localization
|
2127 HAMMER_LOCALIZE_MISC
;
2128 cursor
->key_beg
.obj_id
= ip
->obj_id
;
2129 cursor
->key_beg
.create_tid
= 0;
2130 cursor
->key_beg
.delete_tid
= 0;
2131 cursor
->key_beg
.obj_type
= 0;
2132 cursor
->key_beg
.rec_type
= HAMMER_RECTYPE_CLEAN_START
;
2133 cursor
->key_beg
.key
= HAMMER_MIN_KEY
;
2135 cursor
->key_end
= cursor
->key_beg
;
2136 cursor
->key_end
.rec_type
= HAMMER_RECTYPE_MAX
;
2137 cursor
->key_end
.key
= HAMMER_MAX_KEY
;
2139 cursor
->asof
= ip
->obj_asof
;
2140 cursor
->flags
&= ~HAMMER_CURSOR_INITMASK
;
2141 cursor
->flags
|= HAMMER_CURSOR_END_INCLUSIVE
| HAMMER_CURSOR_ASOF
;
2142 cursor
->flags
|= HAMMER_CURSOR_DELETE_VISIBILITY
;
2143 cursor
->flags
|= HAMMER_CURSOR_BACKEND
;
2145 error
= hammer_ip_first(cursor
);
2148 * Iterate through matching records and mark them as deleted.
2150 while (error
== 0) {
2151 leaf
= cursor
->leaf
;
2153 KKASSERT(leaf
->base
.delete_tid
== 0);
2156 * Mark the record and B-Tree entry as deleted. This will
2157 * also physically delete the B-Tree entry, record, and
2158 * data if the retention policy dictates. The function
2159 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
2160 * to retest the new 'current' element.
2162 * Directory entries (and delete-on-disk directory entries)
2163 * must be synced and cannot be deleted.
2165 error
= hammer_ip_delete_record(cursor
, ip
, trans
->tid
);
2169 error
= hammer_ip_next(cursor
);
2172 hammer_cache_node(&ip
->cache
[1], cursor
->node
);
2173 if (error
== EDEADLK
) {
2174 hammer_done_cursor(cursor
);
2175 error
= hammer_init_cursor(trans
, cursor
, &ip
->cache
[1], ip
);
2179 if (error
== ENOENT
)
2185 * Delete the record at the current cursor. On success the cursor will
2186 * be positioned appropriately for an iteration but may no longer be at
2189 * This routine is only called from the backend.
2191 * NOTE: This can return EDEADLK, requiring the caller to terminate the
2195 hammer_ip_delete_record(hammer_cursor_t cursor
, hammer_inode_t ip
,
2198 hammer_record_t iprec
;
2201 KKASSERT(cursor
->flags
& HAMMER_CURSOR_BACKEND
);
2205 * In-memory (unsynchronized) records can simply be freed. This
2206 * only occurs in range iterations since all other records are
2207 * individually synchronized. Thus there should be no confusion with
2210 * An in-memory record may be deleted before being committed to disk,
2211 * but could have been accessed in the mean time. The reservation
2212 * code will deal with the case.
2214 if (hammer_cursor_inmem(cursor
)) {
2215 iprec
= cursor
->iprec
;
2216 KKASSERT((iprec
->flags
& HAMMER_RECF_INTERLOCK_BE
) ==0);
2217 iprec
->flags
|= HAMMER_RECF_DELETED_FE
;
2218 iprec
->flags
|= HAMMER_RECF_DELETED_BE
;
2219 KKASSERT(iprec
->ip
== ip
);
2220 ++ip
->rec_generation
;
2225 * On-disk records are marked as deleted by updating their delete_tid.
2226 * This does not affect their position in the B-Tree (which is based
2227 * on their create_tid).
2229 * Frontend B-Tree operations track inodes so we tell
2230 * hammer_delete_at_cursor() not to.
2232 error
= hammer_btree_extract_leaf(cursor
);
2234 error
= hammer_delete_at_cursor(
2236 HAMMER_DELETE_ADJUST
| hammer_nohistory(ip
),
2238 cursor
->trans
->time32
,
2245 * Used to write a generic record w/optional data to the media b-tree
2246 * when no inode context is available. Used by the mirroring and
2249 * Caller must set cursor->key_beg to leaf->base. The cursor must be
2250 * flagged for backend operation and not flagged ASOF (since we are
2251 * doing an insertion).
2253 * This function will acquire the appropriate sync lock and will set
2254 * the cursor insertion flag for the operation, do the btree lookup,
2255 * and the insertion, and clear the insertion flag and sync lock before
2256 * returning. The cursor state will be such that the caller can continue
2257 * scanning (used by the mirroring code).
2259 * mode: HAMMER_CREATE_MODE_UMIRROR copyin data, check crc
2260 * HAMMER_CREATE_MODE_SYS bcopy data, generate crc
2262 * NOTE: EDEADLK can be returned. The caller must do deadlock handling and
2265 * EALREADY can be returned if the record already exists (WARNING,
2266 * because ASOF cannot be used no check is made for illegal
2269 * NOTE: Do not use the function for normal inode-related records as this
2270 * function goes directly to the media and is not integrated with
2271 * in-memory records.
/*
 * hammer_create_at_cursor: insert the element described by `leaf` into the
 * B-Tree at `cursor`.  When leaf->data_len is non-zero, media space is
 * allocated for the payload and filled from `udata` before the insertion.
 *
 * cursor - must have HAMMER_CURSOR_BACKEND set and HAMMER_CURSOR_ASOF clear
 *          (asserted below); cursor->trans supplies the transaction.
 * leaf   - element to insert.  leaf->data_offset is overwritten here: it
 *          receives the newly allocated offset, or 0 when there is no data.
 * udata  - payload source, read with copyin() in UMIRROR mode and with
 *          bcopy() in SYS mode.
 * mode   - HAMMER_CREATE_MODE_UMIRROR or HAMMER_CREATE_MODE_SYS.
 *
 * NOTE(review): this copy of the function is missing lines (return type,
 * braces, several declarations, the switch/default labels, the failure
 * label and the return statements).  Confirm anything that depends on
 * those against the complete file.
 */
2274 hammer_create_at_cursor(hammer_cursor_t cursor
, hammer_btree_leaf_elm_t leaf
,
2275 void *udata
, int mode
)
2277 hammer_transaction_t trans
;
2279 hammer_buffer_t data_buffer
;
2280 hammer_off_t ndata_offset
;
2281 hammer_tid_t high_tid
;
2286 trans
= cursor
->trans
;
/* Backend cursors only; ASOF must not be set for an insertion. */
2292 KKASSERT((cursor
->flags
&
2293 (HAMMER_CURSOR_BACKEND
| HAMMER_CURSOR_ASOF
)) ==
2294 (HAMMER_CURSOR_BACKEND
));
/* Shared sync lock; released by hammer_sync_unlock() on the way out. */
2296 hammer_sync_lock_sh(trans
);
/* Payload present: allocate media space and fill it according to mode. */
2298 if (leaf
->data_len
) {
2299 ndata
= hammer_alloc_data(trans
, leaf
->data_len
,
2300 leaf
->base
.rec_type
,
2301 &ndata_offset
, &data_buffer
,
/*
 * Allocation failed: the sync lock is dropped here.
 * NOTE(review): presumably followed by an early return; not visible.
 */
2303 if (ndata
== NULL
) {
2304 hammer_sync_unlock(trans
);
2307 leaf
->data_offset
= ndata_offset
;
/*
 * The payload write is bracketed by hammer_modify_buffer_noundo() and
 * hammer_modify_buffer_done().
 */
2308 hammer_modify_buffer_noundo(trans
, data_buffer
);
2311 case HAMMER_CREATE_MODE_UMIRROR
:
2312 error
= copyin(udata
, ndata
, leaf
->data_len
);
/* A zero result from hammer_crc_test_leaf() is logged as a mismatch. */
2314 if (hammer_crc_test_leaf(hmp
->version
, ndata
, leaf
) == 0) {
2315 hdkprintf("CRC DATA @ %016jx/%d MISMATCH ON PIPE\n",
2316 (intmax_t)ndata_offset
,
2320 error
= hammer_cursor_localize_data(
2325 case HAMMER_CREATE_MODE_SYS
:
2326 bcopy(udata
, ndata
, leaf
->data_len
);
/* SYS mode sets the CRC from the copied data instead of testing it. */
2328 hammer_crc_set_leaf(hmp
->version
, ndata
, leaf
);
/* NOTE(review): presumably the default case of the mode switch. */
2331 hpanic("bad mode %d", mode
);
2332 break; /* NOT REACHED */
2334 hammer_modify_buffer_done(data_buffer
);
/* No payload: record a zero data offset in the leaf. */
2336 leaf
->data_offset
= 0;
2344 * Do the insertion. This can fail with a EDEADLK or EALREADY
2346 cursor
->flags
|= HAMMER_CURSOR_INSERT
;
/*
 * Any lookup result other than ENOENT takes the branch below; its body
 * is not visible in this extract.
 */
2347 error
= hammer_btree_lookup(cursor
);
2348 if (error
!= ENOENT
) {
2353 error
= hammer_btree_insert(cursor
, leaf
, &doprop
);
2356 * Cursor is left on current element, we want to skip it now.
2357 * (in case the caller is scanning)
2359 cursor
->flags
|= HAMMER_CURSOR_ATEDISK
;
2360 cursor
->flags
&= ~HAMMER_CURSOR_INSERT
;
2363 * If the insertion happens to be creating (and not just replacing)
2364 * an inode we have to track it.
2367 leaf
->base
.rec_type
== HAMMER_RECTYPE_INODE
&&
2368 leaf
->base
.delete_tid
== 0) {
2369 hammer_modify_volume_field(trans
, trans
->rootvol
,
2371 ++trans
->hmp
->rootvol
->ondisk
->vol0_stat_inodes
;
2372 hammer_modify_volume_done(trans
->rootvol
);
2376 * vol0_next_tid must track the highest TID stored in the filesystem.
2377 * We do not need to generate undo for this update.
/* high_tid = max(create_tid, delete_tid) of the inserted leaf. */
2379 high_tid
= leaf
->base
.create_tid
;
2380 if (high_tid
< leaf
->base
.delete_tid
)
2381 high_tid
= leaf
->base
.delete_tid
;
2382 if (trans
->rootvol
->ondisk
->vol0_next_tid
< high_tid
) {
2383 hammer_modify_volume_noundo(trans
, trans
->rootvol
);
2384 trans
->rootvol
->ondisk
->vol0_next_tid
= high_tid
;
2385 hammer_modify_volume_done(trans
->rootvol
);
2389 * WARNING! cursor's leaf pointer may have changed after
2390 * do_propagation returns.
2392 if (error
== 0 && doprop
)
2393 hammer_btree_do_propagation(cursor
, leaf
);
/* On error, free the data space recorded in leaf->data_offset. */
2399 if (error
&& leaf
->data_offset
) {
2400 hammer_blockmap_free(trans
, leaf
->data_offset
, leaf
->data_len
);
2403 hammer_sync_unlock(trans
);
/* NOTE(review): the guard (if any) on this release is not visible. */
2405 hammer_rel_buffer(data_buffer
, 0);
2410 * Delete the B-Tree element at the current cursor and do any necessary
2411 * mirror propagation.
2413 * The cursor must be properly positioned for an iteration on return but
2414 * may be pointing at an internal element.
2416 * An element can be un-deleted by passing a delete_tid of 0 with
2417 * HAMMER_DELETE_ADJUST.
2419 * This function will store the number of bytes deleted in *stat_bytes
2420 * if stat_bytes is not NULL.
/*
 * hammer_delete_at_cursor: adjust the delete_tid of, and/or destroy, the
 * B-Tree record element at the cursor's current node/index.
 *
 * cursor       - must be positioned on a HAMMER_BTREE_TYPE_RECORD element
 *                (asserted below); passed to hammer_cursor_upgrade() first.
 * delete_flags - HAMMER_DELETE_ADJUST stores delete_tid/delete_ts into the
 *                element; HAMMER_DELETE_DESTROY removes the element with
 *                hammer_btree_delete().  Each bit is tested by its own if.
 * delete_tid   - value stored into the element in ADJUST mode; also
 *                compared against the root volume's vol0_next_tid.
 * delete_ts    - value stored into the element's delete_ts in ADJUST mode.
 * track        - tested together with error == 0 ahead of the root-volume
 *                vol0_stat_inodes / vol0_next_tid bookkeeping.
 * stat_bytes   - receives the accumulated `bytes` count.
 *
 * NOTE(review): this copy of the function is missing lines (return type,
 * braces, several declarations, the icount adjustments, the propagation
 * conditions and the return statement).  Confirm anything that depends on
 * those against the complete file.
 */
2423 hammer_delete_at_cursor(hammer_cursor_t cursor
, int delete_flags
,
2424 hammer_tid_t delete_tid
, uint32_t delete_ts
,
2425 int track
, int64_t *stat_bytes
)
2427 struct hammer_btree_leaf_elm save_leaf
;
2428 hammer_transaction_t trans
;
2429 hammer_btree_leaf_elm_t leaf
;
2431 hammer_btree_elm_t elm
;
2432 hammer_off_t data_offset
;
/* NOTE(review): the check of this result is not visible in this extract. */
2440 error
= hammer_cursor_upgrade(cursor
);
2444 trans
= cursor
->trans
;
2445 node
= cursor
->node
;
/* elm is the on-disk element at the cursor's index within its node. */
2446 elm
= &node
->ondisk
->elms
[cursor
->index
];
2448 KKASSERT(elm
->base
.btype
== HAMMER_BTREE_TYPE_RECORD
);
/* Shared sync lock; released by hammer_sync_unlock() at the end. */
2450 hammer_sync_lock_sh(trans
);
2456 * Adjust the delete_tid. Update the mirror_tid propagation field
2457 * as well. delete_tid can be 0 (undelete -- used by mirroring).
2459 if (delete_flags
& HAMMER_DELETE_ADJUST
) {
/*
 * For inode records the two tests below distinguish a delete (delete_tid
 * going from 0 to non-zero) from an undelete (non-zero to 0).  The
 * statements they guard are not visible in this extract.
 */
2460 if (elm
->base
.rec_type
== HAMMER_RECTYPE_INODE
) {
2461 if (elm
->leaf
.base
.delete_tid
== 0 && delete_tid
)
2463 if (elm
->leaf
.base
.delete_tid
&& delete_tid
== 0)
/* Store the new delete_tid/delete_ts inside a modify/done bracket. */
2467 hammer_modify_node(trans
, node
, elm
, sizeof(*elm
));
2468 elm
->leaf
.base
.delete_tid
= delete_tid
;
2469 elm
->leaf
.delete_ts
= delete_ts
;
2470 hammer_modify_node_done(node
);
/* Raise the node's mirror_tid when the new delete_tid is larger. */
2472 if (elm
->leaf
.base
.delete_tid
> node
->ondisk
->mirror_tid
) {
2473 hammer_modify_node_field(trans
, node
, mirror_tid
);
2474 node
->ondisk
->mirror_tid
= elm
->leaf
.base
.delete_tid
;
2475 hammer_modify_node_done(node
);
2477 if (hammer_debug_general
& 0x0002) {
2478 hdkprintf("propagate %016jx @%016jx\n",
2479 (intmax_t)elm
->leaf
.base
.delete_tid
,
2480 (intmax_t)node
->node_offset
);
2485 * Adjust for the iteration. We have deleted the current
2486 * element and want to clear ATEDISK so the iteration does
2487 * not skip the element after, which now becomes the current
2488 * element. This element must be re-tested if doing an
2489 * iteration, which is handled by the RETEST flag.
2491 if ((cursor
->flags
& HAMMER_CURSOR_DISKEOF
) == 0) {
2492 cursor
->flags
|= HAMMER_CURSOR_RETEST
;
2493 cursor
->flags
&= ~HAMMER_CURSOR_ATEDISK
;
2497 * An on-disk record cannot have the same delete_tid
2498 * as its create_tid. In a chain of record updates
2499 * this could result in a duplicate record.
2501 KKASSERT(elm
->leaf
.base
.delete_tid
!=
2502 elm
->leaf
.base
.create_tid
);
2506 * Destroy the B-Tree element if asked (typically if a nohistory
2507 * file or mount, or when called by the pruning code).
2509 * Adjust the ATEDISK flag to properly support iterations.
2511 if (delete_flags
& HAMMER_DELETE_DESTROY
) {
/*
 * Capture the data reference and a copy of the leaf before the element
 * is removed by hammer_btree_delete() below.
 */
2512 data_offset
= elm
->leaf
.data_offset
;
2513 data_len
= elm
->leaf
.data_len
;
2515 save_leaf
= elm
->leaf
;
/* NOTE(review): the body of this inode test is not visible here. */
2518 if (elm
->base
.rec_type
== HAMMER_RECTYPE_INODE
&&
2519 elm
->leaf
.base
.delete_tid
== 0) {
2523 error
= hammer_btree_delete(cursor
, &ndelete
);
2526 * The deletion moves the next element (if any) to
2527 * the current element position. We must clear
2528 * ATEDISK so this element is not skipped and we
2529 * must set RETEST to force any iteration to re-test
2532 if ((cursor
->flags
& HAMMER_CURSOR_DISKEOF
) == 0) {
2533 cursor
->flags
|= HAMMER_CURSOR_RETEST
;
2534 cursor
->flags
&= ~HAMMER_CURSOR_ATEDISK
;
/*
 * Add ndelete node-sized units to the byte count.
 * NOTE(review): presumably ndelete is the number of B-Tree nodes freed
 * by hammer_btree_delete() -- confirm.
 */
2536 bytes
+= (ndelete
* sizeof(struct hammer_node_ondisk
));
/* Data located in these zones is released via hammer_blockmap_free(). */
2538 switch(HAMMER_ZONE(data_offset
)) {
2539 case HAMMER_ZONE_LARGE_DATA
:
2540 case HAMMER_ZONE_SMALL_DATA
:
2541 case HAMMER_ZONE_META
:
2542 hammer_blockmap_free(trans
,
2543 data_offset
, data_len
);
2553 * Track inode count and next_tid. This is used by the mirroring
2554 * and PFS code. icount can be negative, zero, or positive.
2556 if (error
== 0 && track
) {
2558 hammer_modify_volume_field(trans
, trans
->rootvol
,
2560 trans
->rootvol
->ondisk
->vol0_stat_inodes
+= icount
;
2561 hammer_modify_volume_done(trans
->rootvol
);
2563 if (trans
->rootvol
->ondisk
->vol0_next_tid
< delete_tid
) {
2564 hammer_modify_volume_noundo(trans
, trans
->rootvol
);
2565 trans
->rootvol
->ondisk
->vol0_next_tid
= delete_tid
;
2566 hammer_modify_volume_done(trans
->rootvol
);
2571 * mirror_tid propagation occurs if the node's mirror_tid had to be
2572 * updated while adjusting the delete_tid.
2574 * This occurs when deleting even in nohistory mode, but does not
2575 * occur when pruning an already-deleted node.
2577 * cursor->ip is NULL when called from the pruning, mirroring,
2578 * and pfs code. If non-NULL propagation will be conditionalized
2579 * on whether the PFS is in no-history mode or not.
2581 * WARNING: cursor's leaf pointer may have changed after do_propagation
/*
 * NOTE(review): the conditions selecting between the two propagation
 * calls below are not visible in this extract.
 */
2586 hammer_btree_do_propagation(cursor
, leaf
);
2588 hammer_btree_do_propagation(cursor
, leaf
);
/* NOTE(review): presumably guarded by a stat_bytes NULL check -- confirm. */
2591 *stat_bytes
= bytes
;
2592 hammer_sync_unlock(trans
);
2597 * Determine whether we can remove a directory. This routine checks whether
2598 * a directory is empty or not and enforces flush connectivity.
2600 * Flush connectivity requires that we block if the target directory is
2601 * currently flushing, otherwise it may not end up in the same flush group.
2603 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
/*
 * hammer_ip_check_directory_empty: look for any record of directory `ip`
 * in the range [HAMMER_RECTYPE_ENTRY_START, HAMMER_RECTYPE_MAX], as of
 * ip->obj_asof, using a single hammer_ip_first() call.
 *
 * trans - transaction used to initialize the cursor.
 * ip    - directory inode; supplies the localization, obj_id, asof and
 *         the cache slot (ip->cache[1]) for the cursor.
 *
 * NOTE(review): the statements that translate the hammer_ip_first()
 * result (the ENOENT and 0 branches) and the return are missing from
 * this copy.  Presumably ENOENT becomes success and a found record
 * becomes ENOTEMPTY -- confirm against the complete file.
 */
2606 hammer_ip_check_directory_empty(hammer_transaction_t trans
, hammer_inode_t ip
)
2608 struct hammer_cursor cursor
;
2612 * Check directory empty
2614 hammer_init_cursor(trans
, &cursor
, &ip
->cache
[1], ip
);
/* Start key: lowest key of the first directory-entry record type. */
2616 cursor
.key_beg
.localization
= ip
->obj_localization
|
2617 hammer_dir_localization(ip
);
2618 cursor
.key_beg
.obj_id
= ip
->obj_id
;
2619 cursor
.key_beg
.create_tid
= 0;
2620 cursor
.key_beg
.delete_tid
= 0;
2621 cursor
.key_beg
.obj_type
= 0;
2622 cursor
.key_beg
.rec_type
= HAMMER_RECTYPE_ENTRY_START
;
2623 cursor
.key_beg
.key
= HAMMER_MIN_KEY
;
/* End key: same object, widened to the last record type and key. */
2625 cursor
.key_end
= cursor
.key_beg
;
2626 cursor
.key_end
.rec_type
= HAMMER_RECTYPE_MAX
;
2627 cursor
.key_end
.key
= HAMMER_MAX_KEY
;
/* As-of lookup at the inode's obj_asof; the end key is inclusive. */
2629 cursor
.asof
= ip
->obj_asof
;
2630 cursor
.flags
|= HAMMER_CURSOR_END_INCLUSIVE
| HAMMER_CURSOR_ASOF
;
/* One probe is enough: only the presence of a first record matters. */
2632 error
= hammer_ip_first(&cursor
);
2633 if (error
== ENOENT
)
2635 else if (error
== 0)
2637 hammer_done_cursor(&cursor
);
2642 * Localize the data payload. Directory entries may need their
2643 * localization adjusted.
2647 hammer_cursor_localize_data(hammer_mount_t hmp
, hammer_data_ondisk_t data
,
2648 hammer_btree_leaf_elm_t leaf
)
2650 uint32_t localization
;
2652 if (leaf
->base
.rec_type
== HAMMER_RECTYPE_DIRENTRY
) {
2653 localization
= leaf
->base
.localization
&
2654 HAMMER_LOCALIZE_PSEUDOFS_MASK
;
2655 if (data
->entry
.localization
!= localization
) {
2656 data
->entry
.localization
= localization
;
2657 hammer_crc_set_leaf(hmp
->version
, data
, leaf
);