HAMMER 6/many - memory->disk flush, single-cluster sync to disk, more vnops.
[dragonfly/netmp.git] / sys / vfs / hammer / hammer_object.c
blobaff9ac83ff22e920d3512f78776da39b48820429
1 /*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.5 2007/11/26 05:03:11 dillon Exp $
37 #include "hammer.h"
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip);
45 * Red-black tree support.
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
50 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 return(-1);
52 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53 return(1);
55 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 return(-1);
57 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58 return(1);
60 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61 return(-1);
62 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63 return(1);
64 return(0);
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
71 * A key1->rec_type of 0 matches any record type.
73 if (info->rec_type) {
74 if (info->rec_type < rec->rec.base.base.rec_type)
75 return(-3);
76 if (info->rec_type > rec->rec.base.base.rec_type)
77 return(3);
81 * There is no special case for key. 0 means 0.
83 if (info->key < rec->rec.base.base.key)
84 return(-2);
85 if (info->key > rec->rec.base.base.key)
86 return(2);
89 * This test has a number of special cases. create_tid in key1 is
90 * the as-of transction id, and delete_tid in key1 is NOT USED.
92 * A key1->create_tid of 0 matches any record regardles of when
93 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
94 * used to search for the most current state of the object.
96 * key2->create_tid is a HAMMER record and will never be
97 * 0. key2->delete_tid is the deletion transaction id or 0 if
98 * the record has not yet been deleted.
100 if (info->create_tid) {
101 if (info->create_tid < rec->rec.base.base.create_tid)
102 return(-1);
103 if (rec->rec.base.base.delete_tid &&
104 info->create_tid >= rec->rec.base.base.delete_tid) {
105 return(1);
108 return(0);
112 * RB_SCAN comparison code for hammer_mem_search(). The argument order
113 * is reversed so the comparison result has to be negated. key_beg and
114 * key_end are both inclusive boundaries.
116 static
118 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
120 hammer_cursor_t cursor = data;
121 int r;
123 r = hammer_rec_compare(&cursor->key_beg, rec);
124 if (r > 0)
125 return(-1);
126 if (r == 0)
127 return(0);
128 r = hammer_rec_compare(&cursor->key_end, rec);
129 if (r <= 0)
130 return(1);
131 return(0);
134 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
135 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
136 hammer_rec_compare, hammer_base_elm_t);
139 * Allocate a record for the caller to finish filling in
141 hammer_record_t
142 hammer_alloc_mem_record(struct hammer_transaction *trans, hammer_inode_t ip)
144 hammer_record_t record;
146 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
147 record->ip = ip;
148 return (record);
152 * Release a memory record. If the record is marked for defered deletion,
153 * destroy the record when the last reference goes away.
155 void
156 hammer_rel_mem_record(struct hammer_record **recordp)
158 hammer_record_t rec;
160 if ((rec = *recordp) != NULL) {
161 if (hammer_islastref(&rec->lock)) {
162 hammer_unref(&rec->lock);
163 if (rec->flags & HAMMER_RECF_DELETED)
164 hammer_free_mem_record(rec);
165 } else {
166 hammer_unref(&rec->lock);
168 *recordp = NULL;
173 * Free a record. Clean the structure up even though we are throwing it
174 * away as a sanity check. The actual free operation is delayed while
175 * the record is referenced. However, the record is removed from the RB
176 * tree immediately.
178 void
179 hammer_free_mem_record(hammer_record_t record)
181 if (record->flags & HAMMER_RECF_ONRBTREE) {
182 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
183 record->flags &= ~HAMMER_RECF_ONRBTREE;
185 if (record->lock.refs) {
186 record->flags |= HAMMER_RECF_DELETED;
187 return;
189 if (record->flags & HAMMER_RECF_ALLOCDATA) {
190 kfree(record->data, M_HAMMER);
191 record->flags &= ~HAMMER_RECF_ALLOCDATA;
193 record->data = NULL;
194 kfree(record, M_HAMMER);
198 * Lookup an in-memory record given the key specified in the cursor. Works
199 * just like hammer_btree_lookup() but operates on an inode's in-memory
200 * record list.
202 * The lookup must fail if the record is marked for deferred deletion.
204 static
206 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
208 int error;
210 if (cursor->iprec)
211 hammer_rel_mem_record(&cursor->iprec);
212 if (cursor->ip) {
213 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
214 &cursor->ip->rec_tree);
216 cursor->ip = ip;
217 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
218 cursor->scan.node = NULL;
219 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
220 &ip->rec_tree, &cursor->key_beg);
221 if (cursor->iprec == NULL) {
222 error = ENOENT;
223 } else {
224 hammer_ref(&cursor->iprec->lock);
225 error = 0;
227 return(error);
231 * hammer_mem_search() - locate the first in-memory record matching the
232 * cursor.
234 * The RB_SCAN function we use is designed as a callback. We terminate it
235 * (return -1) as soon as we get a match.
237 static
239 hammer_rec_scan_callback(hammer_record_t rec, void *data)
241 hammer_cursor_t cursor = data;
243 if (cursor->iprec == NULL) {
244 cursor->iprec = rec;
245 hammer_ref(&rec->lock);
246 return(-1);
248 return(0);
251 static
253 hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip)
255 if (cursor->iprec)
256 hammer_rel_mem_record(&cursor->iprec);
257 if (cursor->ip) {
258 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
259 &cursor->ip->rec_tree);
261 cursor->ip = ip;
262 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
263 cursor->scan.node = NULL;
264 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265 hammer_rec_scan_callback, cursor);
266 if (cursor->iprec) {
267 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
268 return(0);
270 return(ENOENT);
273 void
274 hammer_mem_done(hammer_cursor_t cursor)
276 if (cursor->ip) {
277 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
278 &cursor->ip->rec_tree);
279 cursor->ip = NULL;
281 if (cursor->iprec)
282 hammer_rel_mem_record(&cursor->iprec);
285 /************************************************************************
286 * HAMMER IN-MEMORY RECORD FUNCTIONS *
287 ************************************************************************
289 * These functions manipulate in-memory records. Such records typically
290 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
294 * Add a directory entry (dip,ncp) which references inode (ip).
296 * Note that the low 32 bits of the namekey are set temporarily to create
297 * a unique in-memory record, and may be modified a second time when the
298 * record is synchronized to disk. In particular, the low 32 bits cannot be
299 * all 0's when synching to disk, which is not handled here.
302 hammer_ip_add_directory(struct hammer_transaction *trans,
303 struct hammer_inode *dip, struct namecache *ncp,
304 struct hammer_inode *ip)
306 hammer_record_t record;
307 int error;
308 int bytes;
310 record = hammer_alloc_mem_record(trans, dip);
312 kprintf("add to directory dip %p\n", dip);
313 bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */
314 if (++trans->hmp->namekey_iterator == 0)
315 ++trans->hmp->namekey_iterator;
317 record->rec.entry.base.base.obj_id = dip->obj_id;
318 record->rec.entry.base.base.key =
319 hammer_directory_namekey(ncp->nc_name, bytes);
320 record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
321 record->rec.entry.base.base.create_tid = trans->tid;
322 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
323 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
324 record->rec.entry.obj_id = ip->obj_id;
325 if (bytes <= sizeof(record->rec.entry.den_name)) {
326 record->data = (void *)record->rec.entry.den_name;
327 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
328 } else {
329 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
330 record->flags |= HAMMER_RECF_ALLOCDATA;
332 bcopy(ncp->nc_name, record->data, bytes);
333 record->rec.entry.base.data_len = bytes;
334 ++ip->ino_rec.ino_nlinks;
335 hammer_modify_inode(trans, ip,
336 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
337 error = hammer_mem_add(trans, record);
338 return(error);
342 * Delete the directory entry and update the inode link count. The
343 * cursor must be seeked to the directory entry record being deleted.
345 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
348 hammer_ip_del_directory(struct hammer_transaction *trans,
349 hammer_cursor_t cursor, struct hammer_inode *dip,
350 struct hammer_inode *ip)
352 int error;
354 if (cursor->record == &cursor->iprec->rec) {
356 * The directory entry was in-memory, just scrap the
357 * record.
359 hammer_free_mem_record(cursor->iprec);
360 error = 0;
361 } else {
363 * The directory entry was on-disk, mark the record and
364 * B-Tree entry as deleted. The B-Tree entry does not
365 * have to be reindexed because a 'current' delete transid
366 * will wind up in the same position as the live record.
368 KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
369 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
370 if (error == 0) {
371 cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
372 cursor->record->base.base.delete_tid = trans->tid;
373 hammer_modify_node(cursor->node);
374 hammer_modify_buffer(cursor->record_buffer);
379 * One less link. The file may still be open in the OS even after
380 * all links have gone away so we don't destroy the inode's data
381 * here.
383 if (error == 0) {
384 --ip->ino_rec.ino_nlinks;
385 hammer_modify_inode(trans, ip,
386 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
388 return(error);
392 * Sync data from a buffer cache buffer (typically) to the filesystem. This
393 * is called via the strategy called from a cached data source. This code
394 * is responsible for actually writing a data record out to the disk.
397 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
398 int64_t offset, void *data, int bytes)
400 struct hammer_cursor cursor;
401 hammer_record_ondisk_t rec;
402 union hammer_btree_elm elm;
403 void *bdata;
404 int error;
406 error = hammer_init_cursor_ip(&cursor, ip);
407 if (error)
408 return(error);
409 cursor.key_beg.obj_id = ip->obj_id;
410 cursor.key_beg.key = offset + bytes;
411 cursor.key_beg.create_tid = trans->tid;
412 cursor.key_beg.delete_tid = 0;
413 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
414 cursor.flags = HAMMER_CURSOR_INSERT;
417 * Issue a lookup to position the cursor and locate the cluster
419 error = hammer_btree_lookup(&cursor);
420 if (error == 0) {
421 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
422 offset, bytes);
423 error = EIO;
425 if (error != ENOENT)
426 goto done;
429 * Allocate record and data space now that we know which cluster
430 * the B-Tree node ended up in.
432 bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
433 &cursor.data_buffer);
434 if (bdata == NULL)
435 goto done;
436 rec = hammer_alloc_record(cursor.node->cluster, &error,
437 &cursor.record_buffer);
438 if (rec == NULL)
439 goto fail1;
442 * Fill everything in and insert our B-Tree node.
444 rec->base.base = cursor.key_beg;
445 rec->base.data_crc = crc32(data, bytes);
446 rec->base.rec_id = 0; /* XXX */
447 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
448 rec->base.data_len = bytes;
449 hammer_modify_buffer(cursor.record_buffer);
451 bcopy(data, bdata, bytes);
452 hammer_modify_buffer(cursor.data_buffer);
454 elm.leaf.base = cursor.key_beg;
455 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
456 elm.leaf.data_offset = rec->base.data_offset;
457 elm.leaf.data_len = bytes;
458 elm.leaf.data_crc = rec->base.data_crc;
460 error = hammer_btree_insert(&cursor, &elm);
461 if (error == 0)
462 goto done;
464 hammer_free_record_ptr(cursor.record_buffer, rec);
465 fail1:
466 hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
467 done:
468 hammer_done_cursor(&cursor);
469 return(error);
473 * Sync an in-memory record to the disk. this is typically called via fsync
474 * from a cached record source. This code is responsible for actually
475 * writing a record out to the disk.
478 hammer_ip_sync_record(hammer_record_t record)
480 struct hammer_cursor cursor;
481 hammer_record_ondisk_t rec;
482 union hammer_btree_elm elm;
483 void *bdata;
484 int error;
486 error = hammer_init_cursor_ip(&cursor, record->ip);
487 if (error)
488 return(error);
489 cursor.key_beg = record->rec.base.base;
490 cursor.flags = HAMMER_CURSOR_INSERT;
493 * Issue a lookup to position the cursor and locate the cluster
495 error = hammer_btree_lookup(&cursor);
496 if (error == 0) {
497 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
498 record->rec.base.base.key);
499 error = EIO;
501 if (error != ENOENT)
502 goto done;
505 * Allocate record and data space now that we know which cluster
506 * the B-Tree node ended up in.
508 if (record->data == NULL ||
509 (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
510 bdata = record->data;
511 } else {
512 bdata = hammer_alloc_data(cursor.node->cluster,
513 record->rec.base.data_len, &error,
514 &cursor.data_buffer);
515 if (bdata == NULL)
516 goto done;
518 rec = hammer_alloc_record(cursor.node->cluster, &error,
519 &cursor.record_buffer);
520 if (rec == NULL)
521 goto fail1;
524 * Fill everything in and insert our B-Tree node.
526 * XXX assign rec_id here
528 *rec = record->rec;
529 kprintf("record->rec %p data %p\n", &record->rec, record->data);
530 if (bdata) {
531 rec->base.data_crc = crc32(record->data,
532 record->rec.base.data_len);
533 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
535 * Data embedded in record
537 rec->base.data_offset = ((char *)bdata -
538 (char *)&record->rec);
539 KKASSERT(rec->base.data_offset >= 0 &&
540 rec->base.data_offset + rec->base.data_len <
541 sizeof(*rec));
542 rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
543 } else {
545 * Data separate from record
547 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
548 bcopy(record->data, bdata, rec->base.data_len);
549 hammer_modify_buffer(cursor.data_buffer);
552 rec->base.rec_id = 0; /* XXX */
554 hammer_modify_buffer(cursor.record_buffer);
556 elm.leaf.base = cursor.key_beg;
557 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
558 elm.leaf.data_offset = rec->base.data_offset;
559 elm.leaf.data_len = rec->base.data_len;
560 elm.leaf.data_crc = rec->base.data_crc;
562 error = hammer_btree_insert(&cursor, &elm);
563 if (error == 0)
564 goto done;
566 hammer_free_record_ptr(cursor.record_buffer, rec);
567 fail1:
568 if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
569 hammer_free_data_ptr(cursor.data_buffer, bdata,
570 rec->base.data_len);
572 done:
573 hammer_done_cursor(&cursor);
574 kprintf("hammer_ip_sync_record_done %d\n", error);
575 return(error);
580 * Add the record to the inode's rec_tree. The low 32 bits of a directory
581 * entry's key is used to deal with hash collisions in the upper 32 bits.
582 * A unique 64 bit key is generated in-memory and may be regenerated a
583 * second time when the directory record is flushed to the on-disk B-Tree.
585 static
587 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
589 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
590 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
591 hammer_free_mem_record(record);
592 return (EEXIST);
594 if (++trans->hmp->namekey_iterator == 0)
595 ++trans->hmp->namekey_iterator;
596 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
597 record->rec.base.base.key |= trans->hmp->namekey_iterator;
599 record->flags |= HAMMER_RECF_ONRBTREE;
600 return(0);
603 /************************************************************************
604 * HAMMER INODE MERGED-RECORD FUNCTIONS *
605 ************************************************************************
607 * These functions augment the B-Tree scanning functions in hammer_btree.c
608 * by merging in-memory records with on-disk records.
612 * Locate a particular record either in-memory or on-disk.
614 * NOTE: This is basically a standalone routine, hammer_ip_next() may
615 * NOT be called to iterate results.
618 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
620 int error;
623 * If the element is in-memory return it without searching the
624 * on-disk B-Tree
626 error = hammer_mem_lookup(cursor, ip);
627 if (error == 0) {
628 cursor->record = &cursor->iprec->rec;
629 return(error);
631 if (error != ENOENT)
632 return(error);
635 * If the inode has on-disk components search the on-disk B-Tree.
637 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
638 return(error);
639 error = hammer_btree_lookup(cursor);
640 if (error == 0)
641 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
642 return(error);
646 * Locate the first record within the cursor's key_beg/key_end range,
647 * restricted to a particular inode. 0 is returned on success, ENOENT
648 * if no records matched the requested range, or some other error.
650 * When 0 is returned hammer_ip_next() may be used to iterate additional
651 * records within the requested range.
654 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
656 int error;
659 * Clean up fields and setup for merged scan
661 cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
662 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
663 if (cursor->iprec)
664 hammer_rel_mem_record(&cursor->iprec);
667 * Search the on-disk B-Tree. hammer_btree_lookup() only does an
668 * exact lookup so if we get ENOENT we have to call the iterate
669 * function to validate the first record after the begin key.
671 * The ATEDISK flag is used by hammer_btree_iterate to determine
672 * whether it must index forwards or not.
674 if (ip->flags & HAMMER_INODE_ONDISK) {
675 error = hammer_btree_lookup(cursor);
676 if (error == ENOENT) {
677 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
678 error = hammer_btree_iterate(cursor);
680 if (error && error != ENOENT)
681 return(error);
682 if (error == 0) {
683 cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
684 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
685 } else {
686 cursor->flags |= HAMMER_CURSOR_ATEDISK;
691 * Search the in-memory record list (Red-Black tree). Unlike the
692 * B-Tree search, mem_search checks for records in the range.
694 error = hammer_mem_search(cursor, ip);
695 if (error && error != ENOENT)
696 return(error);
697 if (error == 0) {
698 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
699 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
703 * This will return the first matching record.
705 return(hammer_ip_next(cursor));
709 * Retrieve the next record in a merged iteration within the bounds of the
710 * cursor. This call may be made multiple times after the cursor has been
711 * initially searched with hammer_ip_first().
713 * 0 is returned on success, ENOENT if no further records match the
714 * requested range, or some other error code is returned.
717 hammer_ip_next(hammer_cursor_t cursor)
719 hammer_btree_elm_t elm;
720 hammer_record_t rec;
721 int error;
722 int r;
725 * Load the current on-disk and in-memory record. If we ate any
726 * records we have to get the next one.
728 * Get the next on-disk record
730 if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
731 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
732 error = hammer_btree_iterate(cursor);
733 if (error == 0)
734 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
735 else
736 cursor->flags |= HAMMER_CURSOR_DISKEOF;
741 * Get the next in-memory record. The record can be ripped out
742 * of the RB tree so we maintain a scan_info structure to track
743 * the next node.
745 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
746 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
747 rec = cursor->scan.node; /* next node */
748 if (rec) {
749 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
750 hammer_ref(&rec->lock);
751 cursor->scan.node =
752 hammer_rec_rb_tree_RB_NEXT(rec);
753 } else {
754 cursor->flags |= HAMMER_CURSOR_MEMEOF;
756 hammer_rel_mem_record(&cursor->iprec);
757 cursor->iprec = rec;
762 * Extract either the disk or memory record depending on their
763 * relative position.
765 error = 0;
766 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
767 case 0:
769 * Both entries valid
771 elm = &cursor->node->ondisk->elms[cursor->index];
772 r = hammer_btree_cmp(&elm->base,
773 &cursor->iprec->rec.base.base);
774 if (r < 0) {
775 error = hammer_btree_extract(cursor,
776 HAMMER_CURSOR_GET_RECORD);
777 cursor->flags |= HAMMER_CURSOR_ATEDISK;
778 break;
780 /* fall through to the memory entry */
781 case HAMMER_CURSOR_ATEDISK:
783 * Only the memory entry is valid
785 cursor->record = &cursor->iprec->rec;
786 cursor->flags |= HAMMER_CURSOR_ATEMEM;
787 break;
788 case HAMMER_CURSOR_ATEMEM:
790 * Only the disk entry is valid
792 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
793 cursor->flags |= HAMMER_CURSOR_ATEDISK;
794 break;
795 default:
797 * Neither entry is valid
799 * XXX error not set properly
801 cursor->record = NULL;
802 error = ENOENT;
803 break;
805 return(error);
809 * Resolve the cursor->data pointer for the current cursor position in
810 * a merged iteration.
813 hammer_ip_resolve_data(hammer_cursor_t cursor)
815 int error;
817 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
818 cursor->data = cursor->iprec->data;
819 error = 0;
820 } else {
821 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
823 return(error);
827 * Delete all records within the specified range for inode ip.
829 * NOTE: An unaligned range will cause new records to be added to cover
830 * the edge cases.
832 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
835 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
836 int64_t ran_beg, int64_t ran_end)
838 struct hammer_cursor cursor;
839 hammer_record_ondisk_t rec;
840 hammer_base_elm_t base;
841 int error;
842 int64_t off;
844 hammer_init_cursor_ip(&cursor, ip);
846 cursor.key_beg.obj_id = ip->obj_id;
847 cursor.key_beg.create_tid = ip->obj_asof;
848 cursor.key_beg.delete_tid = 0;
849 cursor.key_beg.obj_type = 0;
850 cursor.key_beg.key = ran_beg;
851 cursor.key_end = cursor.key_beg;
852 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
853 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
854 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
855 cursor.key_end.key = ran_end;
856 } else {
857 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
858 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
859 if (ran_end + MAXPHYS < ran_end)
860 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
861 else
862 cursor.key_end.key = ran_end + MAXPHYS;
865 error = hammer_ip_first(&cursor, ip);
868 * Iterate through matching records and mark them as deleted.
870 while (error == 0) {
871 rec = cursor.record;
872 base = &rec->base.base;
874 KKASSERT(base->delete_tid == 0);
877 * There may be overlap cases for regular file data. Also
878 * remember the key for a regular file record is the offset
879 * of the last byte of the record (base + len - 1), NOT the
880 * base offset.
882 if (base->rec_type == HAMMER_RECTYPE_DATA) {
883 off = base->key - rec->base.data_len + 1;
885 * Check the left edge case
887 if (off < ran_beg) {
888 panic("hammer left edge case\n");
892 * Check the right edge case. Note that the
893 * record can be completely out of bounds, which
894 * terminates the search.
896 * base->key is (base_offset + bytes - 1), ran_end
897 * works the same way.
899 if (base->key > ran_end) {
900 if (base->key - rec->base.data_len + 1 > ran_end) {
901 kprintf("right edge OOB\n");
902 break;
904 panic("hammer right edge case\n");
909 * Mark the record and B-Tree entry as deleted
911 if (cursor.record == &cursor.iprec->rec) {
912 hammer_free_mem_record(cursor.iprec);
914 } else {
915 cursor.node->ondisk->
916 elms[cursor.index].base.delete_tid = trans->tid;
917 cursor.record->base.base.delete_tid = trans->tid;
918 hammer_modify_node(cursor.node);
919 hammer_modify_buffer(cursor.record_buffer);
921 error = hammer_ip_next(&cursor);
923 hammer_done_cursor(&cursor);
924 if (error == ENOENT)
925 error = 0;
926 return(error);