HAMMER 27/many: Major surgery - change allocation model
[dragonfly.git] / sys/vfs/hammer/hammer_object.c
blob 34e14e59b03386d248e85d7ba0e342323496fc66
/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.29 2008/02/08 08:30:59 dillon Exp $
 */

#include "hammer.h"
static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
/*
 * Red-black tree support.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
        if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
                return(-1);
        if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
                return(1);

        if (rec1->rec.base.base.key < rec2->rec.base.base.key)
                return(-1);
        if (rec1->rec.base.base.key > rec2->rec.base.base.key)
                return(1);

        if (rec1->rec.base.base.create_tid == 0) {
                if (rec2->rec.base.base.create_tid == 0)
                        return(0);
                return(1);
        }
        if (rec2->rec.base.base.create_tid == 0)
                return(-1);

        if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
                return(-1);
        if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
                return(1);
        return(0);
}
static int
hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
{
        if (info->rec_type < rec->rec.base.base.rec_type)
                return(-3);
        if (info->rec_type > rec->rec.base.base.rec_type)
                return(3);

        if (info->key < rec->rec.base.base.key)
                return(-2);
        if (info->key > rec->rec.base.base.key)
                return(2);

        if (info->create_tid == 0) {
                if (rec->rec.base.base.create_tid == 0)
                        return(0);
                return(1);
        }
        if (rec->rec.base.base.create_tid == 0)
                return(-1);
        if (info->create_tid < rec->rec.base.base.create_tid)
                return(-1);
        if (info->create_tid > rec->rec.base.base.create_tid)
                return(1);
        return(0);
}
/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
 * These do not stop the scan.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_compare(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        r = hammer_rec_compare(&cursor->key_end, rec);
        if (r < -1)
                return(1);
        return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_compare, hammer_base_elm_t);
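/*
 * Illustrative sketch (not part of the code path): the create_tid
 * comparisons above treat a create_tid of 0 as greater than any real
 * transaction id, i.e. as a not-yet-committed 'current' record.  Two
 * hypothetical records with identical rec_type and key sort as follows.
 */
#if 0
        struct hammer_record a, b;

        bzero(&a, sizeof(a));
        bzero(&b, sizeof(b));
        a.rec.base.base.create_tid = 100;       /* committed at TID 100 */
        b.rec.base.base.create_tid = 0;         /* in-memory, 'current' */
        KKASSERT(hammer_rec_rb_compare(&a, &b) == -1);  /* a sorts first */
        KKASSERT(hammer_rec_rb_compare(&b, &a) == 1);
#endif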
/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int32_t rec_len)
{
        hammer_record_t record;

        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
        record->ip = ip;
        record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        record->rec_len = rec_len;
        hammer_ref(&record->lock);
        return (record);
}
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
        hammer_unref(&record->lock);

        if (record->flags & HAMMER_RECF_DELETED) {
                if (record->flags & HAMMER_RECF_ONRBTREE) {
                        RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
                                  record);
                        record->flags &= ~HAMMER_RECF_ONRBTREE;
                }
                if (record->lock.refs == 0) {
                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
                        record->data = NULL;
                        --hammer_count_records;
                        kfree(record, M_HAMMER);
                        return;
                }
        }

        /*
         * If someone wanted the record wake them up.
         */
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
        }
}
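/*
 * Usage sketch (hypothetical caller): the usual life cycle of an
 * in-memory record.  Marking a referenced record HAMMER_RECF_DELETED
 * defers the actual free to the final hammer_rel_mem_record() above.
 */
#if 0
        hammer_record_t record;

        record = hammer_alloc_mem_record(ip, sizeof(struct hammer_entry_record));
        /* ... fill in record->rec and record->data ... */
        record->flags |= HAMMER_RECF_DELETED;   /* abandon the record */
        hammer_rel_mem_record(record);          /* last ref, freed here */
#endif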
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
        int error;

        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
        }
        cursor->ip = ip;
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
        cursor->scan.node = NULL;
        cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
                                &ip->rec_tree, &cursor->key_beg);
        if (cursor->iprec == NULL) {
                error = ENOENT;
        } else {
                hammer_ref(&cursor->iprec->lock);
                error = 0;
        }
        return(error);
}
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor.
 *
 * The RB_SCAN function we use is designed as a callback.  We terminate it
 * (return -1) as soon as we get a match.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;

        /*
         * We terminate on success, so this should be NULL on entry.
         */
        KKASSERT(cursor->iprec == NULL);

        /*
         * Skip if the record was marked deleted
         */
        if (rec->flags & HAMMER_RECF_DELETED)
                return(0);

        /*
         * Skip if not visible due to our as-of TID
         */
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                if (cursor->asof < rec->rec.base.base.create_tid)
                        return(0);
                if (rec->rec.base.base.delete_tid &&
                    cursor->asof >= rec->rec.base.base.delete_tid) {
                        return(0);
                }
        }

        /*
         * Block if currently being synchronized to disk, otherwise we
         * may get a duplicate.  Wakeup the syncer if it's stuck on
         * the record.
         */
        hammer_ref(&rec->lock);
        ++rec->blocked;
        while (rec->flags & HAMMER_RECF_SYNCING) {
                rec->flags |= HAMMER_RECF_WANTED;
                tsleep(rec, 0, "hmrrc2", 0);
        }
        --rec->blocked;

        /*
         * The record may have been deleted while we were blocked,
         * in which case we release the reference we took above.
         */
        if (rec->flags & HAMMER_RECF_DELETED) {
                hammer_rel_mem_record(rec);
                return(0);
        }

        /*
         * Set the matching record and stop the scan.
         */
        cursor->iprec = rec;
        return(-1);
}
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
        }
        cursor->ip = ip;
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);

        cursor->scan.node = NULL;
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         */
        if (cursor->iprec) {
                cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
                return(0);
        }
        return(ENOENT);
}
void
hammer_mem_done(hammer_cursor_t cursor)
{
        if (cursor->ip) {
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
                cursor->ip = NULL;
        }
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
}
/************************************************************************
 *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when syncing to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                        struct hammer_inode *dip, struct namecache *ncp,
                        struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;
        int bytes;

        record = hammer_alloc_mem_record(dip, sizeof(struct hammer_entry_record));

        bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        record->rec.entry.base.base.obj_id = dip->obj_id;
        record->rec.entry.base.base.key =
                hammer_directory_namekey(ncp->nc_name, bytes);
        record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
        record->rec.entry.base.base.create_tid = trans->tid;
        record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
        record->rec.entry.obj_id = ip->obj_id;
        record->data = (void *)ncp->nc_name;
        record->rec.entry.base.data_len = bytes;
        ++ip->ino_rec.ino_nlinks;
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
}
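/*
 * Illustration (a sketch, assuming hammer_directory_namekey() fills
 * only the upper 32 bits of the key, which the low-32-bit masking in
 * hammer_mem_add() and hammer_ip_sync_record() relies on): composition
 * of the 64-bit directory entry key built above.
 */
#if 0
        int64_t key;

        key = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
        KKASSERT((key & 0xFFFFFFFFLL) == 0);    /* name hash, upper 32 bits */
        key |= trans->hmp->namekey_iterator;    /* uniquifier, low 32 bits */
#endif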
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                        hammer_cursor_t cursor, struct hammer_inode *dip,
                        struct hammer_inode *ip)
{
        int error;

        error = hammer_ip_delete_record(cursor, trans->tid);

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away so we only try to sync if the OS has
         * no references and nlinks falls to 0.
         *
         * We have to terminate the cursor before syncing the inode to
         * avoid deadlocking against ourselves.
         */
        if (error == 0) {
                --ip->ino_rec.ino_nlinks;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                if (ip->ino_rec.ino_nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_done_cursor(cursor);
                        hammer_sync_inode(ip, MNT_NOWAIT, 1);
                }
        }
        return(error);
}
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 *      record->rec.entry.base.base.key
 *      record->rec.entry.base.base.rec_type
 *      record->rec.entry.base.data_len
 *      record->data    (a copy will be kmalloc'd if it cannot be embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;

        record->rec.base.base.obj_id = ip->obj_id;
        record->rec.base.base.create_tid = trans->tid;
        record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;

        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
}
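/*
 * Usage sketch (hypothetical caller; the some_* names are placeholders,
 * not real identifiers): the caller contract documented above.  The
 * call eats the reference obtained from hammer_alloc_mem_record().
 */
#if 0
        hammer_record_t record;
        int error;

        record = hammer_alloc_mem_record(ip, sizeof(struct hammer_base_record));
        record->rec.base.base.key = some_key;
        record->rec.base.base.rec_type = some_rec_type;
        record->rec.base.data_len = some_len;
        record->data = some_data;       /* copied by hammer_mem_add() */
        error = hammer_ip_add_record(trans, record);
#endif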
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy code from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
                    int64_t offset, void *data, int bytes)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        hammer_off_t rec_offset;
        hammer_off_t data_offset;
        void *bdata1, *bdata2;
        int32_t data2_index;
        int error;

        KKASSERT((offset & HAMMER_BUFMASK) == 0);
        KKASSERT((bytes & HAMMER_BUFMASK) == 0);
retry:
        error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
        if (error)
                return(error);
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = offset + bytes;
        cursor.key_beg.create_tid = trans->tid;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
        cursor.asof = trans->tid;
        cursor.flags |= HAMMER_CURSOR_INSERT;

        /*
         * Issue a lookup to position the cursor.
         */
        error = hammer_btree_lookup(&cursor);
        if (error == 0) {
                kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
                        offset, bytes);
                hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
                                       HAMMER_BTREE_TYPE_LEAF, cursor.index);
                error = EIO;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate record and data space.  HAMMER_RECTYPE_DATA records
         * can cross buffer boundaries so we may have to split our bcopy.
         */
        rec = hammer_alloc_record(ip->hmp, &rec_offset, HAMMER_RECTYPE_DATA,
                                  sizeof(rec->data), &cursor.record_buffer,
                                  &data_offset, bytes,
                                  &bdata1, &bdata2, &data2_index,
                                  &cursor.data_buffer, &error);
        if (rec == NULL)
                goto done;

        /*
         * Fill everything in and insert our B-Tree node.
         *
         * NOTE: hammer_alloc_record() has already marked the related
         * buffers as modified.  If we do it again we will generate
         * unnecessary undo elements.
         */
        rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        rec->base.base.obj_id = ip->obj_id;
        rec->base.base.key = offset + bytes;
        rec->base.base.create_tid = trans->tid;
        rec->base.base.delete_tid = 0;
        rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
        rec->base.head.hdr_crc = crc32(data, bytes);
        KKASSERT(rec->base.data_off == data_offset);
        KKASSERT(rec->base.data_len == bytes);

        if (data2_index < bytes) {
                bcopy(data, bdata1, data2_index);
                bcopy((char *)data + data2_index, bdata2,
                      bytes - data2_index);
        } else {
                bcopy(data, bdata1, bytes);
        }

        elm.leaf.base = rec->base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = rec->base.data_off;
        elm.leaf.data_len = bytes;
        elm.leaf.data_crc = rec->base.head.hdr_crc;

        /*
         * Data records can wind up on-disk before the inode itself is
         * on-disk.  One must assume data records may be on-disk if either
         * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set.
         */
        ip->flags |= HAMMER_INODE_DONDISK;

        error = hammer_btree_insert(&cursor, &elm);
        if (error == 0)
                goto done;

        /*
         * If we fail we may be able to unwind the allocation.
         */
        rec->base.head.hdr_type |= HAMMER_HEAD_TYPEF_FREED;
        hammer_unwind_fifo(ip->hmp, rec_offset);
done:
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        return(error);
}
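/*
 * Illustration (example values only): the split-copy case above.  When
 * the allocated data space straddles a buffer boundary,
 * hammer_alloc_record() returns two windows, bdata1 and bdata2, with
 * data2_index marking where the second window begins.
 */
#if 0
        /* bytes = 16384, data2_index = 6144 (hypothetical values) */
        bcopy(data, bdata1, data2_index);               /* first 6144 bytes */
        bcopy((char *)data + data2_index, bdata2,
              bytes - data2_index);                     /* remaining 10240 */
#endif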
/*
 * Sync an in-memory record to the disk.  This is typically called via fsync
 * from a cached record source.  This code is responsible for actually
 * writing a record out to the disk.
 */
int
hammer_ip_sync_record(hammer_record_t record)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        hammer_mount_t hmp;
        union hammer_btree_elm elm;
        hammer_off_t rec_offset;
        hammer_off_t data_offset;
        void *bdata1;
        int32_t alloc_data_len;
        int error;

        hmp = record->ip->hmp;
retry:
        /*
         * If the record has been deleted or is being synchronized, stop.
         * Interlock with the syncing flag.
         */
        if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING))
                return(0);
        record->flags |= HAMMER_RECF_SYNCING;

        /*
         * If someone other than us is referencing the record and not
         * blocking waiting for us, we have to wait until they finish.
         *
         * It is possible the record got destroyed while we were blocked.
         */
        if (record->lock.refs > record->blocked + 1) {
                while (record->lock.refs > record->blocked + 1) {
                        record->flags |= HAMMER_RECF_WANTED;
                        tsleep(record, 0, "hmrrc1", 0);
                }
                if (record->flags & HAMMER_RECF_DELETED)
                        return(0);
        }

        /*
         * Get a cursor
         */
        error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0], hmp);
        if (error)
                return(error);
        cursor.key_beg = record->rec.base.base;
        cursor.flags |= HAMMER_CURSOR_INSERT;

        /*
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
        for (;;) {
                error = hammer_btree_lookup(&cursor);
                if (error)
                        break;
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
                        kprintf("hammer_ip_sync_record: duplicate rec "
                                "at (%016llx)\n", record->rec.base.base.key);
                        Debugger("duplicate record1");
                        error = EIO;
                        break;
                }
                if (++hmp->namekey_iterator == 0)
                        ++hmp->namekey_iterator;
                record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                record->rec.base.base.key |= hmp->namekey_iterator;
                cursor.key_beg.key = record->rec.base.base.key;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Mark the record as undergoing synchronization.  Our cursor is
         * holding a locked B-Tree node for the insertion which interlocks
         * anyone trying to access this record.
         *
         * XXX There is still a race present related to iterations.  An
         * iteration may process the record, a sync may occur, and then
         * later process the B-Tree element for the same record.
         *
         * We do not try to synchronize a deleted record.
         */
        if (record->flags & HAMMER_RECF_DELETED) {
                error = 0;
                goto done;
        }

        /*
         * Allocate the record and data.  The result buffers will be
         * marked as being modified; further calls to
         * hammer_modify_buffer() would generate unneeded UNDO records.
         *
         * Support zero-fill records.
         */
        if (record->data == NULL)
                alloc_data_len = 0;
        else
                alloc_data_len = record->rec.base.data_len;

        rec = hammer_alloc_record(hmp, &rec_offset,
                                  record->rec.base.base.rec_type,
                                  record->rec_len, &cursor.record_buffer,
                                  &data_offset, alloc_data_len,
                                  &bdata1, NULL, NULL,
                                  NULL, &error);

        if (rec == NULL)
                goto done;

        /*
         * Fill in the remaining fields and insert our B-Tree node.
         */
        rec->base.base = record->rec.base.base;
        if (record->rec_len > sizeof(rec->base)) {
                bcopy(&record->rec.base + 1, &rec->base + 1,
                      record->rec_len - sizeof(rec->base));
        }

        /*
         * Copy the data and deal with zero-fill support.
         */
        if (record->data) {
                rec->base.head.hdr_crc = crc32(record->data, alloc_data_len);
                KKASSERT(alloc_data_len == rec->base.data_len);
                bcopy(record->data, bdata1, alloc_data_len);
        } else {
                rec->base.data_len = record->rec.base.data_len;
        }

        elm.leaf.base = record->rec.base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = data_offset;
        elm.leaf.data_len = rec->base.data_len;
        elm.leaf.data_crc = rec->base.head.hdr_crc;

        error = hammer_btree_insert(&cursor, &elm);

        /*
         * Clean up on success, or fall through on error.
         */
        if (error == 0) {
                record->flags |= HAMMER_RECF_DELETED;
                goto done;
        }

        /*
         * Try to unwind the fifo allocation
         */
        rec->base.head.hdr_type |= HAMMER_HEAD_TYPEF_FREED;
        hammer_unwind_fifo(hmp, rec_offset);
done:
        record->flags &= ~HAMMER_RECF_SYNCING;
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        return(error);
}
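/*
 * Usage sketch (hypothetical and simplified; the real flush path lives
 * in the inode sync code): synchronizing every in-memory record on an
 * inode.  On success hammer_ip_sync_record() marks the in-memory copy
 * HAMMER_RECF_DELETED since the record now lives in the B-Tree.
 */
#if 0
        hammer_record_t rec;
        int error = 0;

        RB_FOREACH(rec, hammer_rec_rb_tree, &ip->rec_tree) {
                error = hammer_ip_sync_record(rec);
                if (error)
                        break;
        }
#endif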
/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key are used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
        int bytes;
        void *data;

        /*
         * Make a private copy of record->data
         */
        if (record->data) {
                /*
                 * Try to embed the data in extra space in the record
                 * union, otherwise allocate a copy.
                 */
                bytes = record->rec.base.data_len;
                if (bytes <= (int)sizeof(record->rec) - record->rec_len) {
                        bcopy(record->data,
                              (char *)&record->rec + record->rec_len, bytes);
                        record->data = (void *)((char *)&record->rec +
                                                record->rec_len);
                } else {
                        ++hammer_count_record_datas;
                        data = kmalloc(bytes, M_HAMMER, M_WAITOK);
                        record->flags |= HAMMER_RECF_ALLOCDATA;
                        bcopy(record->data, data, bytes);
                        record->data = data;
                }
        }

        /*
         * Insert into the RB tree, find an unused iterator if this is
         * a directory entry.
         */
        while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
                        record->flags |= HAMMER_RECF_DELETED;
                        hammer_rel_mem_record(record);
                        return (EEXIST);
                }
                if (++trans->hmp->namekey_iterator == 0)
                        ++trans->hmp->namekey_iterator;
                record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                record->rec.base.base.key |= trans->hmp->namekey_iterator;
        }
        record->flags |= HAMMER_RECF_ONRBTREE;
        hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
        hammer_rel_mem_record(record);
        return(0);
}
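/*
 * Worked example (illustrative numbers, assuming the record union is
 * larger than the type-specific record): the embed-vs-allocate decision
 * above.  There are sizeof(record->rec) - record->rec_len bytes of
 * slack past the record proper, so short payloads such as file names
 * can avoid a separate kmalloc().
 */
#if 0
        /*
         * e.g. a 12-byte directory entry name with
         * rec_len = sizeof(struct hammer_entry_record): the name is
         * copied into the slack space and record->data repointed at it.
         */
        record->data = (void *)((char *)&record->rec + record->rec_len);
#endif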
/************************************************************************
 *                 HAMMER INODE MERGED-RECORD FUNCTIONS                 *
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
        int error;

        /*
         * If the element is in-memory return it without searching the
         * on-disk B-Tree
         */
        error = hammer_mem_lookup(cursor, ip);
        if (error == 0) {
                cursor->record = &cursor->iprec->rec;
                return(error);
        }
        if (error != ENOENT)
                return(error);

        /*
         * If the inode has on-disk components search the on-disk B-Tree.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
                return(error);
        error = hammer_btree_lookup(cursor);
        if (error == 0)
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
        return(error);
}
/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
        int error;

        /*
         * Clean up fields and setup for merged scan
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        /*
         * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
         * exact lookup so if we get ENOENT we have to call the iterate
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
         *
         * EDEADLK can only occur if the lookup hit an empty internal
         * element and couldn't delete it.  Since this could only occur
         * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
                if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        error = hammer_btree_iterate(cursor);
                }
                if (error && error != ENOENT)
                        return(error);
                if (error == 0) {
                        cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                } else {
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Search the in-memory record list (Red-Black tree).  Unlike the
         * B-Tree search, mem_first checks for records in the range.
         */
        error = hammer_mem_first(cursor, ip);
        if (error && error != ENOENT)
                return(error);
        if (error == 0) {
                cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
        }

        /*
         * This will return the first matching record.
         */
        return(hammer_ip_next(cursor));
}
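/*
 * Usage sketch (hypothetical caller): the canonical merged-iteration
 * loop, matching the pattern used by the delete-range functions later
 * in this file.
 */
#if 0
        error = hammer_ip_first(&cursor, ip);
        while (error == 0) {
                /* cursor.record points at the current merged record */
                error = hammer_ip_next(&cursor);
        }
        if (error == ENOENT)
                error = 0;      /* clean end of range */
#endif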
/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
        hammer_btree_elm_t elm;
        hammer_record_t rec;
        int error;
        int r;

        /*
         * Load the current on-disk and in-memory record.  If we ate any
         * records we have to get the next one.
         *
         * If we deleted the last on-disk record we had scanned ATEDISK will
         * be clear and DELBTREE will be set, forcing a call to iterate.  The
         * fact that ATEDISK is clear causes iterate to re-test the 'current'
         * element.  If ATEDISK is set, iterate will skip the 'current'
         * element.
         *
         * Get the next on-disk record
         */
        if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        error = hammer_btree_iterate(cursor);
                        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
                        if (error == 0)
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        else
                                cursor->flags |= HAMMER_CURSOR_DISKEOF |
                                                 HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Get the next in-memory record.  The record can be ripped out
         * of the RB tree so we maintain a scan_info structure to track
         * the next node.
         *
         * hammer_rec_scan_cmp:  Is the record still in our general range,
         *                       (non-inclusive of snapshot exclusions)?
         * hammer_rec_scan_callback: Is the record in our snapshot?
         */
        if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
                if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
                        if (cursor->iprec) {
                                hammer_rel_mem_record(cursor->iprec);
                                cursor->iprec = NULL;
                        }
                        rec = cursor->scan.node;        /* next node */
                        while (rec) {
                                if (hammer_rec_scan_cmp(rec, cursor) != 0)
                                        break;
                                if (hammer_rec_scan_callback(rec, cursor) != 0)
                                        break;
                                rec = hammer_rec_rb_tree_RB_NEXT(rec);
                        }
                        if (cursor->iprec) {
                                KKASSERT(cursor->iprec == rec);
                                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                                cursor->scan.node =
                                        hammer_rec_rb_tree_RB_NEXT(rec);
                        } else {
                                cursor->flags |= HAMMER_CURSOR_MEMEOF;
                        }
                }
        }

        /*
         * Extract either the disk or memory record depending on their
         * relative position.
         */
        error = 0;
        switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
        case 0:
                /*
                 * Both entries valid
                 */
                elm = &cursor->node->ondisk->elms[cursor->index];
                r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
                if (r < 0) {
                        error = hammer_btree_extract(cursor,
                                                     HAMMER_CURSOR_GET_RECORD);
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                        break;
                }
                /* fall through to the memory entry */
        case HAMMER_CURSOR_ATEDISK:
                /*
                 * Only the memory entry is valid
                 */
                cursor->record = &cursor->iprec->rec;
                cursor->flags |= HAMMER_CURSOR_ATEMEM;
                break;
        case HAMMER_CURSOR_ATEMEM:
                /*
                 * Only the disk entry is valid
                 */
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                break;
        default:
                /*
                 * Neither entry is valid
                 *
                 * XXX error not set properly
                 */
                cursor->record = NULL;
                error = ENOENT;
                break;
        }
        return(error);
}
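/*
 * Summary of the ATEDISK/ATEMEM selection above (descriptive sketch):
 *
 *      ATEDISK ATEMEM  state                   record returned
 *         0       0    both candidates valid   whichever compares smaller
 *         1       0    disk side consumed      memory record
 *         0       1    memory side consumed    disk record
 *         1       1    both sides exhausted    none (ENOENT)
 */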
/*
 * Resolve the cursor->data1/2 pointer for the current cursor position in
 * a merged iteration.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
        int error;

        if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
                cursor->data1 = cursor->iprec->data;
                cursor->data2 = NULL;
                cursor->data_split = cursor->iprec->rec.base.data_len;
                error = 0;
        } else {
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
        }
        return(error);
}
/*
 * Delete all records within the specified range for inode ip.
 *
 * NOTE: An unaligned range will cause new records to be added to cover
 * the edge cases. (XXX not implemented yet).
 *
 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 * NOTE: Record keys for regular file data have to be special-cased since
 * they indicate the end of the range (key = base + bytes).
 */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
                       int64_t ran_beg, int64_t ran_end)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        hammer_base_elm_t base;
        int error;
        int64_t off;

retry:
        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        cursor.key_end = cursor.key_beg;
        if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
                cursor.key_beg.key = ran_beg;
                cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
                cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
                cursor.key_end.key = ran_end;
        } else {
                /*
                 * The key in the B-Tree is (base+bytes), so the first possible
                 * matching key is ran_beg + 1.
                 */
                int64_t tmp64;

                cursor.key_beg.key = ran_beg + 1;
                cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
                cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

                tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
                if (tmp64 < ran_end)
                        cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
                else
                        cursor.key_end.key = ran_end + MAXPHYS + 1;
        }
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

        error = hammer_ip_first(&cursor, ip);

        /*
         * Iterate through matching records and mark them as deleted.
         */
        while (error == 0) {
                rec = cursor.record;
                base = &rec->base.base;

                KKASSERT(base->delete_tid == 0);

                /*
                 * There may be overlap cases for regular file data.  Also
                 * remember the key for a regular file record is the offset
                 * of the last byte of the record (base + len - 1), NOT the
                 * base offset.
                 */
#if 0
                kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
                if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
                        kprintf("delete_range loop key %016llx\n",
                                base->key - rec->base.data_len);
#endif
                        off = base->key - rec->base.data_len;
                        /*
                         * Check the left edge case.  We currently do not
                         * split existing records.
                         */
                        if (off < ran_beg) {
                                panic("hammer left edge case %016llx %d\n",
                                        base->key, rec->base.data_len);
                        }

                        /*
                         * Check the right edge case.  Note that the
                         * record can be completely out of bounds, which
                         * terminates the search.
                         *
                         * base->key is exclusive of the right edge while
                         * ran_end is inclusive of the right edge.  The
                         * (key - data_len) left boundary is inclusive.
                         *
                         * XXX theory-check this test at some point, are
                         * we missing a + 1 somewhere?  Note that ran_end
                         * could overflow.
                         */
                        if (base->key - 1 > ran_end) {
                                if (base->key - rec->base.data_len > ran_end)
                                        break;
                                panic("hammer right edge case\n");
                        }
                }

                /*
                 * Mark the record and B-Tree entry as deleted.  This will
                 * also physically delete the B-Tree entry, record, and
                 * data if the retention policy dictates.  The function
                 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
                 * uses to perform a fixup.
                 */
                error = hammer_ip_delete_record(&cursor, trans->tid);
                if (error)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        if (error == ENOENT)
                error = 0;
        return(error);
}
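/*
 * Worked example (hypothetical values; the MAXPHYS rationale is an
 * assumption, presumably because a single data record is bounded by
 * MAXPHYS): the key_end computation above.  Since data keys are
 * end-offsets, a record overlapping ran_end can carry a key past
 * ran_end, hence the padded end key; the tmp64 test clamps it when the
 * addition would overflow the signed 64-bit key space.
 */
#if 0
        /* ran_end = 65535, MAXPHYS = 128K: */
        cursor.key_end.key = 65535 + MAXPHYS + 1;       /* padded end key */
        /* ran_end near 0x7FFFFFFFFFFFFFFF: addition wraps negative, so */
        cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;      /* clamp to max */
#endif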
/*
 * Delete all records associated with an inode except the inode record
 * itself.
 */
int
hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        hammer_base_elm_t base;
        int error;

retry:
        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
        cursor.key_beg.key = HAMMER_MIN_KEY;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.rec_type = 0xFFFF;
        cursor.key_end.key = HAMMER_MAX_KEY;

        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor, ip);

        /*
         * Iterate through matching records and mark them as deleted.
         */
        while (error == 0) {
                rec = cursor.record;
                base = &rec->base.base;

                KKASSERT(base->delete_tid == 0);

                /*
                 * Mark the record and B-Tree entry as deleted.  This will
                 * also physically delete the B-Tree entry, record, and
                 * data if the retention policy dictates.  The function
                 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
                 * uses to perform a fixup.
                 */
                error = hammer_ip_delete_record(&cursor, trans->tid);
                if (error)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        if (error == ENOENT)
                error = 0;
        return(error);
}
/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 * cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
        hammer_btree_elm_t elm;
        hammer_mount_t hmp;
        int error;
        int dodelete;

        /*
         * In-memory (unsynchronized) records can simply be freed.
         */
        if (cursor->record == &cursor->iprec->rec) {
                cursor->iprec->flags |= HAMMER_RECF_DELETED;
                return(0);
        }

        /*
         * On-disk records are marked as deleted by updating their delete_tid.
         * This does not affect their position in the B-Tree (which is based
         * on their create_tid).
         */
        error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
        elm = NULL;
        hmp = cursor->node->volume->hmp;

        dodelete = 0;
        if (error == 0) {
                error = hammer_cursor_upgrade(cursor);
                if (error == 0) {
                        hammer_modify_node(cursor->node);
                        elm = &cursor->node->ondisk->elms[cursor->index];
                        elm->leaf.base.delete_tid = tid;
                        hammer_modify_buffer(cursor->record_buffer,
                                        &cursor->record->base.base.delete_tid,
                                        sizeof(hammer_tid_t));
                        cursor->record->base.base.delete_tid = tid;
                }
        }

        /*
         * If we were mounted with the nohistory option, we physically
         * delete the record.
         */
        if (hmp->hflags & HMNT_NOHISTORY)
                dodelete = 1;

        if (error == 0 && dodelete) {
                error = hammer_delete_at_cursor(cursor, NULL);
                if (error) {
                        panic("hammer_ip_delete_record: unable to physically delete the record!\n");
                        error = 0;
                }
        }
        return(error);
}
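/*
 * Illustration (descriptive): history-preserving deletion.  With
 * history enabled the element stays in the B-Tree and as-of lookups
 * select on [create_tid, delete_tid), matching the visibility test in
 * hammer_rec_scan_callback():
 *
 *      create_tid = 100, delete_tid = 200
 *      as-of 150: visible   (100 <= 150 < 200)
 *      as-of 250: invisible (250 >= 200)
 *
 * Mounting with HMNT_NOHISTORY physically removes the element instead.
 */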
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
{
        hammer_btree_elm_t elm;
        hammer_off_t rec_offset;
        hammer_off_t data_offset;
        int32_t data_len;
        u_int8_t rec_type;
        int error;

        elm = &cursor->node->ondisk->elms[cursor->index];
        KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

        rec_offset = elm->leaf.rec_offset;
        data_offset = elm->leaf.data_offset;
        data_len = elm->leaf.data_len;
        rec_type = elm->leaf.base.rec_type;

        error = hammer_btree_delete(cursor);
        if (error == 0) {
                /*
                 * This forces a fixup for the iteration because
                 * the cursor is now either sitting at the 'next'
                 * element or sitting at the end of a leaf.
                 */
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        cursor->flags |= HAMMER_CURSOR_DELBTREE;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                }
                hammer_free_fifo(cursor->node->volume->hmp, rec_offset);
        }
#if 0
        kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
                "(%d remain in cluster)\n",
                cluster->volume->vol_no, cluster->clu_no,
                rec_offset, data_offset, data_len,
                cluster->ondisk->stat_records);
#endif
        return (error);
}
/*
 * Determine whether a directory is empty or not.  Returns 0 if the directory
 * is empty, ENOTEMPTY if it isn't, plus other possible errors.
 */
int
hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
{
        struct hammer_cursor cursor;
        int error;

        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
        cursor.key_beg.key = HAMMER_MIN_KEY;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.rec_type = 0xFFFF;
        cursor.key_end.key = HAMMER_MAX_KEY;

        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor, ip);
        if (error == ENOENT)
                error = 0;
        else if (error == 0)
                error = ENOTEMPTY;
        hammer_done_cursor(&cursor);
        return(error);
}