HAMMER 38F/Many: Undo/Synchronization and crash recovery, stabilization pass
[dfdiff.git] / sys / vfs / hammer / hammer_vnops.c
blob ad311a63525fba5c4890ad489d02c092e1d59cf4

/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.42 2008/04/27 21:07:15 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default = vop_defaultop,
	.vop_fsync = hammer_vop_fsync,
	.vop_getpages = vop_stdgetpages,
	.vop_putpages = vop_stdputpages,
	.vop_read = hammer_vop_read,
	.vop_write = hammer_vop_write,
	.vop_access = hammer_vop_access,
	.vop_advlock = hammer_vop_advlock,
	.vop_close = hammer_vop_close,
	.vop_ncreate = hammer_vop_ncreate,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_nresolve = hammer_vop_nresolve,
	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	.vop_nlink = hammer_vop_nlink,
	.vop_nmkdir = hammer_vop_nmkdir,
	.vop_nmknod = hammer_vop_nmknod,
	.vop_open = hammer_vop_open,
	.vop_pathconf = hammer_vop_pathconf,
	.vop_print = hammer_vop_print,
	.vop_readdir = hammer_vop_readdir,
	.vop_readlink = hammer_vop_readlink,
	.vop_nremove = hammer_vop_nremove,
	.vop_nrename = hammer_vop_nrename,
	.vop_nrmdir = hammer_vop_nrmdir,
	.vop_setattr = hammer_vop_setattr,
	.vop_strategy = hammer_vop_strategy,
	.vop_nsymlink = hammer_vop_nsymlink,
	.vop_nwhiteout = hammer_vop_nwhiteout,
	.vop_ioctl = hammer_vop_ioctl,
	.vop_mountctl = hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
	.vop_default = spec_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_specread,
	.vop_write = hammer_vop_specwrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_specclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default = fifo_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_fiforead,
	.vop_write = hammer_vop_fifowrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_fifoclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT)
		hammer_wait_inode(ip);
	return (ip->error);
}
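
/*
 * Illustrative flow for the fsync above: a userland fsync(2) reduces to
 * queueing the inode to the flusher and, for MNT_WAIT, sleeping in
 * hammer_wait_inode() until the flush completes.  The flush granularity
 * is the whole inode; individual buffers cannot be synced by themselves.
 */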

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	seqcount = ap->a_ioflag >> 16;

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
	 */
	uio = ap->a_uio;
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
		offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
		error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
				     uio->uio_offset - offset, HAMMER_BUFSIZE,
				     MAXBSIZE, seqcount, &bp);
#endif
		error = bread(ap->a_vp, uio->uio_offset - offset,
			      HAMMER_BUFSIZE, &bp);
		if (error) {
			brelse(bp);
			break;
		}
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = HAMMER_BUFSIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_rec.ino_size - uio->uio_offset)
			n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);
		if (error) {
			bqrelse(bp);
			break;
		}
		bqrelse(bp);
	}
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_rec.ino_atime = trans.time;
		hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
	}
	hammer_done_transaction(&trans);
	return (error);
}
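
/*
 * Illustrative arithmetic for the read loop above, assuming the usual
 * 16K HAMMER_BUFSIZE (HAMMER_BUFMASK == 0x3FFF):
 *
 *	uio_offset = 20000
 *	offset     = 20000 & 0x3FFF  = 3616	(offset within the buffer)
 *	block base = 20000 - 3616    = 16384	(address passed to bread())
 *	n          = 16384 - 3616    = 12768	(bytes available in buffer)
 *
 * n is then clipped against uio_resid and the file EOF before the
 * uiomove().
 */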

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct uio *uio;
	off_t offset;
	struct buf *bp;
	int error;
	int n;
	int flags;
	int count;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, ip->hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_rec.ino_size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1
	 */
	if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
	 */
	count = 0;
	while (uio->uio_resid > 0) {
		int fixsize = 0;

		/*
		 * Do not allow huge writes to deadlock the buffer cache
		 */
		if ((++count & 15) == 0) {
			vn_unlock(ap->a_vp);
			if ((ap->a_ioflag & IO_NOBWILL) == 0)
				bwillwrite();
			vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
		}

		offset = uio->uio_offset & HAMMER_BUFMASK;
		n = HAMMER_BUFSIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_rec.ino_size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, uio->uio_offset - offset,
				    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp,
					      uio->uio_offset - offset,
					      HAMMER_BUFSIZE, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, uio->uio_offset - offset,
				    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (uio->uio_offset - offset >= ip->ino_rec.ino_size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, uio->uio_offset - offset,
				    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, uio->uio_offset - offset,
				      HAMMER_BUFSIZE, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0)
			error = uiomove((char *)bp->b_data + offset, n, uio);

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
					  HAMMER_BUFSIZE);
			}
			break;
		}
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		hammer_lock_sh(&ip->lock);
		if (ip->ino_rec.ino_size < uio->uio_offset) {
			ip->ino_rec.ino_size = uio->uio_offset;
			flags = HAMMER_INODE_RDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
		} else {
			flags = 0;
		}
		ip->ino_rec.ino_mtime = trans.time;
		flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);
		hammer_unlock(&ip->lock);

		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
#if 0
		} else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
			   (uio->uio_offset & HAMMER_BUFMASK) == 0) {
			/*
			 * XXX HAMMER can only fsync the whole inode,
			 * doing it on every buffer would be a bad idea.
			 *
			 * If seqcount indicates sequential operation and
			 * we just finished filling a buffer, push it out
			 * now to prevent the buffer cache from becoming
			 * too full, which would trigger non-optimal
			 * flushes.
			 */
			bdwrite(bp);
#endif
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}
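
/*
 * Note on the bwillwrite() interlock in the loop above: cycling the
 * vnode lock every 16 buffers ((++count & 15) == 0, roughly 256K at the
 * assumed 16K HAMMER_BUFSIZE) gives the buffer daemon a chance to drain
 * dirty buffers before a very large write() can exhaust the buffer
 * cache.
 */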

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		kprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	hammer_lock_sh(&nip->lock);
	hammer_lock_sh(&dip->lock);

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
	hammer_finalize_inode(&trans, nip, error);
	if (error)
		kprintf("hammer_ip_add_directory error %d\n", error);
	hammer_unlock(&dip->lock);
	hammer_unlock(&nip->lock);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

#if 0
	if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
	    ip->obj_asof == XXX
	) {
		/* LAZYMOD XXX */
	}
	hammer_itimes(ap->a_vp);
#endif

	vap->va_fsid = ip->hmp->fsid_udev;
	vap->va_fileid = ip->ino_rec.base.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_rec.ino_nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_rec.ino_size;
	hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
	hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
	hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	vap->va_bytes = (ip->ino_rec.ino_size + 63) & ~63;
	vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file */
	vap->va_fsmid = ip->ino_rec.ino_mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_rec.base.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}

	return(0);
}
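
/*
 * Illustrative: va_bytes above is the file size rounded up to HAMMER's
 * 64 byte record-data alignment rather than to a block size, e.g. a
 * 200 byte file reports (200 + 63) & ~63 == 256 bytes.
 */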

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	union hammer_record_ondisk *rec;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	u_int64_t obj_id;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	nlen = ncp->nc_nlen;
	flags = dip->flags;

	hammer_simple_transaction(&trans, dip->hmp);

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2);
			flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
				      asof, flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[0]);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	if (error == 0)
		error = hammer_ip_first(&cursor, dip);

	rec = NULL;
	obj_id = 0;

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		if (nlen == rec->entry.base.data_len &&
		    bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
			obj_id = rec->entry.obj_id;
			break;
		}
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, &dip->cache[1],
				      obj_id, asof, flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
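
/*
 * Example of the '@@' as-of extension handled above (illustrative
 * names):
 *
 *	foo@@0x00000001061a8ba0	- resolve "foo" as of the encoded TID
 *	@@0x00000001061a8ba0	- no path component, the stamp applies
 *				  to the directory itself
 *
 * In both cases the result is forced read-only via HAMMER_INODE_RO.
 */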

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	parent_obj_id = dip->ino_data.parent_obj_id;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);

	ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
			      asof, dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
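
/*
 * Note on the fakename above: the 19 byte allocation exactly fits
 * "0x" + 16 hex digits + NUL, so the synthesized ".." component names
 * the as-of boundary being crossed back to the mount point's asof.
 */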

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	hammer_lock_sh(&ip->lock);
	hammer_lock_sh(&dip->lock);
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
	hammer_unlock(&dip->lock);
	hammer_unlock(&ip->lock);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		kprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	hammer_lock_sh(&nip->lock);
	hammer_lock_sh(&dip->lock);
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
	hammer_finalize_inode(&trans, nip, error);
	hammer_unlock(&dip->lock);
	hammer_unlock(&nip->lock);
	if (error)
		kprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	hammer_lock_sh(&nip->lock);
	hammer_lock_sh(&dip->lock);
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
	hammer_finalize_inode(&trans, nip, error);
	hammer_unlock(&dip->lock);
	hammer_unlock(&nip->lock);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
		return (EROFS);

	return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;

	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);

	while (error == 0) {
		error = hammer_ip_resolve_record_and_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		base = &rec->base.base;
		saveoff = base->key;

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		r = vop_write_dirent(
			     &error, uio, rec->entry.obj_id,
			     hammer_get_dtype(rec->entry.base.base.obj_type),
			     rec->entry.base.data_len,
			     (void *)cursor.data);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
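
/*
 * Illustrative offset/cookie layout for the readdir above:
 *
 *	saveoff 0	artificial "."
 *	saveoff 1	artificial ".."
 *	beyond that	the 64 bit directory-entry keys take over, so
 *			uio_offset doubles as a stable seek position
 *			for the next readdir call
 *			(cursor.key_beg.key = saveoff).
 */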

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	int error;

	ip = VTOI(ap->a_vp);

	hammer_simple_transaction(&trans, ip->hmp);

	hammer_init_cursor(&trans, &cursor, &ip->cache[0]);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor, ip);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			error = uiomove((char *)cursor.data,
					cursor.record->base.data_len,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
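
/*
 * The data read back above is the raw symlink target stored by
 * hammer_vop_nsymlink() as a HAMMER_RECTYPE_FIX record under key
 * HAMMER_FIXKEY_SYMLINK; it carries no terminating NUL, hence the
 * uiomove() of exactly data_len bytes.
 */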

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	int error;

	hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	union hammer_record_ondisk *rec;
	int64_t namekey;
	int error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, fdip->hmp);

	hammer_lock_sh(&ip->lock);
	if (fdip->obj_id < tdip->obj_id) {
		hammer_lock_sh(&fdip->lock);
		hammer_lock_sh(&tdip->lock);
	} else {
		hammer_lock_sh(&tdip->lock);
		hammer_lock_sh(&fdip->lock);
	}

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp.  XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[0]);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor, fdip);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		rec = cursor.record;
		if (fncp->nc_nlen == rec->entry.base.data_len &&
		    bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_unlock(&ip->lock);
		hammer_unlock(&fdip->lock);
		hammer_unlock(&tdip->lock);
		hammer_done_cursor(&cursor);
		hammer_lock_sh(&ip->lock);
		if (fdip->obj_id < tdip->obj_id) {
			hammer_lock_sh(&fdip->lock);
			hammer_lock_sh(&tdip->lock);
		} else {
			hammer_lock_sh(&tdip->lock);
			hammer_lock_sh(&fdip->lock);
		}
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);

failed:
	hammer_unlock(&ip->lock);
	hammer_unlock(&fdip->lock);
	hammer_unlock(&tdip->lock);
	hammer_done_transaction(&trans);
	return (error);
}
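
/*
 * Lock-ordering note for the rename above (a restatement of the
 * invariant in the code, not additional locking): the two directory
 * inodes are always share-locked in ascending obj_id order,
 *
 *	if (fdip->obj_id < tdip->obj_id)
 *		lock fdip, then tdip;
 *	else
 *		lock tdip, then fdip;
 *
 * so two renames crossing the same pair of directories in opposite
 * directions cannot deadlock against each other.
 */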

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	int error;

	hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	off_t aligned_size;
	u_int32_t flags;
	uuid_t uuid;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, ip->hmp);
	hammer_lock_sh(&ip->lock);
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					hammer_to_unix_xid(&ip->ino_data.uid),
					ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_uid);
		if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
			ip->ino_data.uid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	if (vap->va_gid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_gid);
		if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
			ip->ino_data.gid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_rec.ino_size)
				break;
			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			hammer_unlock(&ip->lock);
			if (vap->va_size < ip->ino_rec.ino_size) {
				vtruncbuf(ap->a_vp, vap->va_size,
					  HAMMER_BUFSIZE);
				truncating = 1;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
			}
			hammer_lock_sh(&ip->lock);
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			aligned_size = (vap->va_size + HAMMER_BUFMASK) &
				       ~HAMMER_BUFMASK64;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.
			 */
			if (truncating) {
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				offset = vap->va_size & HAMMER_BUFMASK;
				error = bread(ap->a_vp,
					      aligned_size - HAMMER_BUFSIZE,
					      HAMMER_BUFSIZE, &bp);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      HAMMER_BUFSIZE - offset);
					bdwrite(bp);
				} else {
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_atime =
			hammer_timespec_to_transid(&vap->va_atime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_mtime =
			hammer_timespec_to_transid(&vap->va_mtime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		if (ip->ino_data.mode != vap->va_mode) {
			ip->ino_data.mode = vap->va_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(&trans, ip, modflags);
	hammer_unlock(&ip->lock);
	hammer_done_transaction(&trans);
	return (error);
}
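
/*
 * Illustrative truncation bookkeeping for the VREG/VDATABASE cases
 * above: only the lowest truncation point seen since the last sync is
 * remembered, e.g.
 *
 *	truncate to 10000, then truncate to 30000
 *		-> trunc_off remains 10000
 *
 * The backend uses the cached trunc_off to remove the affected media
 * records in one pass when the inode is synchronized.
 */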

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	hammer_lock_sh(&nip->lock);
	hammer_lock_sh(&dip->lock);
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 */
	if (error == 0) {
		record = hammer_alloc_mem_record(nip);
		bytes = strlen(ap->a_target);

		record->rec.base.base.key = HAMMER_FIXKEY_SYMLINK;
		record->rec.base.base.rec_type = HAMMER_RECTYPE_FIX;
		record->rec.base.data_len = bytes;
		record->data = (void *)ap->a_target;
		/* will be reallocated by routine below */
		error = hammer_ip_add_record(&trans, record);

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_rec.ino_size = bytes;
			hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY);
		}
	}
	hammer_finalize_inode(&trans, nip, error);
	hammer_unlock(&dip->lock);
	hammer_unlock(&nip->lock);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
 */
static
int
hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
{
	struct hammer_transaction trans;
	int error;

	hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
				ap->a_cred, ap->a_flags);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_ioctl { vp, command, data, fflag, cred }
 */
static
int
hammer_vop_ioctl(struct vop_ioctl_args *ap)
{
	struct hammer_inode *ip = ap->a_vp->v_data;

	return(hammer_ioctl(ip, ap->a_command, ap->a_data,
			    ap->a_fflag, ap->a_cred));
}

static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	struct mount *mp;
	int error;

	mp = ap->a_head.a_ops->head.vv_mount;

	switch(ap->a_op) {
	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				    (const struct export_args *)ap->a_ctl);
		break;
	default:
		error = journal_mountctl(ap);
		break;
	}
	return(error);
}

/*
 * hammer_vop_strategy { vp, bio }
 *
 * Strategy call, used for regular file read & write only.  Note that the
 * bp may represent a cluster.
 *
 * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regards to buffer alignment
 * or size.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}
	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	struct bio *bio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;
	int roff;
	int n;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[0]);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);
	boff = 0;

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		base = &rec->base.base;

		rec_offset = base->key - rec->data.base.data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * Note there is a degenerate case here where boff may
		 * already be at bp->b_bufsize.
		 */
		roff = -n;
		rec_offset += roff;
		n = rec->data.base.data_len - roff;
		KKASSERT(n > 0);
		if (n > bp->b_bufsize - boff)
			n = bp->b_bufsize - boff;

		/*
		 * If we cached a truncation point on our front-end the
		 * on-disk version may still have physical records beyond
		 * that point.  Truncate visibility.
		 */
		if (ip->trunc_off <= rec_offset)
			n = 0;
		else if (ip->trunc_off < rec_offset + n)
			n = (int)(ip->trunc_off - rec_offset);

		/*
		 * Copy
		 */
		if (n) {
			bcopy((char *)cursor.data + roff,
			      (char *)bp->b_data + boff, n);
			boff += n;
		}
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);
	return(error);
}
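
/*
 * Worked example for the gap logic above (illustrative numbers, one
 * 16K buffer at bio_offset 0, records keyed at BASE+LEN):
 *
 *	record A: rec_offset 0,    data_len 4096	(key 4096)
 *	record B: rec_offset 8192, data_len 4096	(key 12288)
 *
 * Record A: n = 0 - (0 + 0) = 0, no gap; copy 4096, boff = 4096.
 * Record B: n = 8192 - 4096 = 4096 > 0, so [4096, 8192) is zero-filled
 * before copying; boff ends at 12288.  The tail [12288, 16384) is
 * zeroed by the post-loop bzero().
 */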

/*
 * Write to a regular file.  Because this is a strategy call the OS is
 * trying to actually sync data to the media.  HAMMER can only flush
 * the entire inode (so the TID remains properly synchronized).
 *
 * Basically all we do here is place the bio on the inode's flush queue
 * and activate the flusher.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * If the inode is being flushed we cannot re-queue buffers
	 * it may have already flushed, or it could result in duplicate
	 * records in the database.
	 */
	BUF_KERNPROC(bp);
	if (ip->flush_state == HAMMER_FST_FLUSH)
		TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
	else
		TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
	hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	kprintf("a");
	return(0);
}
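
/*
 * The queueing above is the front-end half of the write path: the bio
 * is parked on bio_list (or bio_alt_list if a flush is already in
 * progress) and the flusher later hands it to hammer_dowrite() below,
 * which performs the actual media update.
 */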

/*
 * Backend code which actually performs the write to the media.  This
 * routine is typically called from the flusher.  The bio will be disposed
 * of (biodone'd) by this routine.
 *
 * Iterate the related records and mark for deletion.  If existing edge
 * records (left and right side) overlap our write they have to be marked
 * deleted and new records created, usually referencing a portion of the
 * original data.  Then add a record to represent the buffer.
 */
int
hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio)
{
	struct buf *bp = bio->bio_buf;
	int error;

	KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);

	/*
	 * Delete any records overlapping our range.  This function will
	 * (eventually) properly truncate partial overlaps.
	 */
	if (ip->sync_ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
					       bio->bio_offset);
	} else {
		error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
					       bio->bio_offset +
					       bp->b_bufsize - 1);
	}

	/*
	 * Add a single record to cover the write.  We can write a record
	 * with only the actual file data - for example, a small 200 byte
	 * file does not have to write out a 16K record.
	 *
	 * While the data size does not have to be aligned, we still do it
	 * to reduce fragmentation in a future allocation model.
	 */
	if (error == 0) {
		int limit_size;

		if (ip->sync_ino_rec.ino_size - bio->bio_offset >
		    bp->b_bufsize) {
			limit_size = bp->b_bufsize;
		} else {
			limit_size = (int)(ip->sync_ino_rec.ino_size -
					   bio->bio_offset);
			KKASSERT(limit_size >= 0);
			limit_size = (limit_size + 63) & ~63;
		}

		error = hammer_ip_sync_data(trans, ip, bio->bio_offset,
					    bp->b_data, limit_size);
	}
	if (error)
		Debugger("hammer_dowrite: error");

	if (error) {
		bp->b_resid = bp->b_bufsize;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_resid = 0;
	}
	biodone(bio);
	return(error);
}
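
/*
 * Illustrative sizing for the record written above: a 200 byte file
 * occupying a 16K buffer is recorded as a single data record of
 * (200 + 63) & ~63 == 256 bytes, trading up to 63 bytes of padding
 * for a much smaller record than the full buffer.
 */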

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, int flags)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_record_ondisk_t rec;
	struct hammer_cursor cursor;
	int64_t namekey;
	int error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[0]);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor, dip);
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		if (ncp->nc_nlen == rec->entry.base.data_len &&
		    bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		ip = hammer_get_inode(trans, &dip->cache[1],
				      rec->entry.obj_id,
				      dip->hmp->asof, 0, &error);
		if (error == ENOENT) {
			kprintf("obj_id %016llx\n", rec->entry.obj_id);
			Debugger("ENOENT unlinking object that should exist");
		}
		if (error == 0 && ip->ino_rec.base.base.obj_type ==
				  HAMMER_OBJTYPE_DIRECTORY) {
			error = hammer_ip_check_directory_empty(trans, ip);
		}

		/*
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a lock recursion.  It's ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			hammer_lock_sh(&ip->lock);
			hammer_lock_sh(&dip->lock);
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
			hammer_unlock(&dip->lock);
			hammer_unlock(&ip->lock);
		}
		if (error == 0) {
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip->vp)
				cache_inval_vp(ip->vp, CINV_DESTROY);
		}
		hammer_rel_inode(ip, 0);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;

	return (error);
}
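
/*
 * Note on the retry above: hammer_ip_del_directory() can return
 * EDEADLK after tearing down the cursor to break a lock recursion;
 * since the directory entry was not deleted in that case, rebuilding
 * the cursor and rescanning the chain from key_beg is safe.
 */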

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 */

static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_specclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

static int
hammer_vop_specread (struct vop_read_args *ap)
{
	/* XXX update access time */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}

static int
hammer_vop_specwrite (struct vop_write_args *ap)
{
	/* XXX update last change time */
	return (VOCALL(&spec_vnode_vops, &ap->a_head));
}