HAMMER - Fix lost inode issue (primarily with nohistory mounts)
sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			struct vnode *dvp, struct ucred *cred,
			int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it ain't
 *	 here yet.  And, in any case, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	int waitfor = ap->a_waitfor;

	/*
	 * Fsync rule relaxation (default disabled)
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
			/* full semantics */
			break;
		case 1:
			/* asynchronous */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 2:
			/* synchronous fsync on close */
			ip->flags |= HAMMER_INODE_CLOSESYNC;
			return(0);
		case 3:
			/* asynchronous fsync on close */
			ip->flags |= HAMMER_INODE_CLOSEASYNC;
			return(0);
		default:
			/* ignore the fsync() system call */
			return(0);
		}
	}
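
	/*
	 * Relaxation example: assuming the knob is exported with the
	 * usual naming for these hammer_* tunables, i.e. as the
	 * vfs.hammer.fsync_mode sysctl, then "sysctl
	 * vfs.hammer.fsync_mode=3" would turn every fsync() syscall
	 * into an asynchronous flush issued when the file is closed.
	 */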

	/*
	 * Go do it
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPALMOSTSAFE
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int got_mplock;
	int bigread;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;
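
	/*
	 * Worked example: a 256KB read into 16KB buffers yields
	 * seqcount = (262144 + 16383) / 16384 = 16, overriding a
	 * smaller kernel-supplied heuristic (that heuristic rides in
	 * the upper 16 bits of a_ioflag, hence the >> 16 above).
	 */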

	/*
	 * Temporary hack until more of HAMMER can be made MPSAFE.
	 */
#ifdef SMP
	if (curthread->td_mpcount) {
		got_mplock = -1;
		hammer_start_transaction(&trans, ip->hmp);
	} else {
		got_mplock = 0;
	}
#else
	hammer_start_transaction(&trans, ip->hmp);
	got_mplock = -1;
#endif

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
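	/*
	 * Block size rule of thumb (assuming the usual HAMMER
	 * constants): offsets below the 1MB demarc use 16KB buffers
	 * and offsets at or beyond it use 64KB buffers, which is why
	 * blksize is recomputed from uio_offset on every iteration.
	 */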
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getcacheblk(ap->a_vp, base_offset);
		if (bp) {
			error = 0;
			goto skip;
		}

		/*
		 * MPUNSAFE
		 */
		if (got_mplock == 0) {
			got_mplock = 1;
			get_mplock();
			hammer_start_transaction(&trans, ip->hmp);
		}

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}
skip:

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * XXX only update the atime if we had to get the MP lock.
	 * XXX hack hack hack, fixme.
	 */
	if (got_mplock) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		if (got_mplock > 0)
			rel_mplock();
	}
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid;	/* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
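
	/*
	 * Overflow example: with uio_offset = 2^63 - 8 and
	 * uio_resid = 64, base_offset wraps negative, so the
	 * base_offset <= uio_offset test trips and the write is
	 * rejected with EFBIG instead of silently wrapping around.
	 */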

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lock out other processes due to
		 * bwillwrite() mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		}
		if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
			while (ip->rsv_recs >= hammer_limit_inode_recs) {
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
			}
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		}

#if 0
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
#endif

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}
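
		/*
		 * Masking example: with blksize 16384, a write at
		 * uio_offset 20000 gives offset = 20000 & 16383 = 3616,
		 * base_offset = 16384, and room for n = 12768 bytes
		 * before the next block boundary.
		 */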
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 * expected to not blow up if it encounters buffers that
		 * do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
#if 0
		if (offset + n == blksize) {
			if (hammer_cluster_enable == 0 ||
			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
				bawrite(bp);
			} else {
				cluster_write(bp, ip->ino_data.size,
					      blksize, seqcount);
			}
		} else {
#endif
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);
	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;

	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	return (error);
}
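
/*
 * Note the ordering above: the inode record is created first, then the
 * directory entry that references it (which also bumps nlinks), all
 * under a single hammer transaction.  On failure the new inode is
 * released while still unlinked (nlinks 0), and is expected to be
 * destroyed on release rather than leaked.
 */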

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
	 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			vap->va_size = 26;
		else
			vap->va_size = 10;
	}
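
	/*
	 * For example, the on-media softlink body "@@PFS00005" for a
	 * slave PFS expands to "@@0x<sync_end_tid>:00005" (26 bytes),
	 * while a master expands to "@@-1:00005" (10 bytes); see the
	 * matching expansion in hammer_vop_readlink() below.
	 */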

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
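
	/*
	 * va_bytes rounding examples (assuming the usual 16KB/64KB
	 * buffer constants): a 100 byte file reports
	 * (100 + 15) & ~15 = 112 bytes, a 20000 byte file rounds up
	 * to 32768, and a file past the large-block demarc rounds up
	 * to a 64KB boundary.
	 */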
	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
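
	/*
	 * Chained-hash sketch: the name hashes into the upper bits of
	 * namekey, so every entry colliding on that hash lands inside
	 * [namekey, namekey + max_iterations].  The B-Tree scan below
	 * walks that window and does the exact name compare itself.
	 */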

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
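
/*
 * A dangling directory entry (the WARNING path above) is the
 * user-visible symptom of a lost inode: the dirent exists but the
 * inode record it points at cannot be found as-of that TID.  The
 * dummy inode keeps the lookup from failing hard, which presumably
 * lets the stale entry still be inspected and removed.
 */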

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less than that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
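
	/*
	 * Offsets 0 and 1 are thus synthetic "." and ".." entries which
	 * double as their own cookies; every real entry that follows is
	 * keyed at or above 0x0000000100000000, so the two key spaces
	 * can never collide.
	 */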

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}
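
/*
 * The hammer_nohistory() test above skips the free-space check on
 * nohistory mounts; presumably a remove there frees space rather than
 * consuming history/UNDO space, so it is allowed to proceed even when
 * the volume is nearly full.
 */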

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the
	 * purposes of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}
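
/*
 * Rename is thus add-new-entry first, delete-old-entry second, both
 * under one transaction; the EDEADLK retry above can momentarily leave
 * both names linked, which appears to be what the atomicity caveat in
 * the XXX comment refers to.
 */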

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}

/*
 * hammer_vop_markatime { vp, cred }
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;

	ip->ino_data.atime = trans.time;
	hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	return (0);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;
	int error;
	int truncating;
	int blksize;
	int kflags;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
2031 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2032 switch(ap->a_vp->v_type) {
2033 case VREG:
2034 if (vap->va_size == ip->ino_data.size)
2035 break;
2037 * XXX break atomicy, we can deadlock the backend
2038 * if we do not release the lock. Probably not a
2039 * big deal here.
2041 blksize = hammer_blocksize(vap->va_size);
2042 if (vap->va_size < ip->ino_data.size) {
2043 vtruncbuf(ap->a_vp, vap->va_size, blksize);
2044 truncating = 1;
2045 kflags |= NOTE_WRITE;
2046 } else {
2047 vnode_pager_setsize(ap->a_vp, vap->va_size);
2048 truncating = 0;
2049 kflags |= NOTE_WRITE | NOTE_EXTEND;
2051 ip->ino_data.size = vap->va_size;
2052 ip->ino_data.mtime = trans.time;
2053 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2056 * on-media truncation is cached in the inode until
2057 * the inode is synchronized.
2059 if (truncating) {
2060 hammer_ip_frontend_trunc(ip, vap->va_size);
2061 #ifdef DEBUG_TRUNCATE
2062 if (HammerTruncIp == NULL)
2063 HammerTruncIp = ip;
2064 #endif
2065 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2066 ip->flags |= HAMMER_INODE_TRUNCATED;
2067 ip->trunc_off = vap->va_size;
2068 #ifdef DEBUG_TRUNCATE
2069 if (ip == HammerTruncIp)
2070 kprintf("truncate1 %016llx\n",
2071 (long long)ip->trunc_off);
2072 #endif
2073 } else if (ip->trunc_off > vap->va_size) {
2074 ip->trunc_off = vap->va_size;
2075 #ifdef DEBUG_TRUNCATE
2076 if (ip == HammerTruncIp)
2077 kprintf("truncate2 %016llx\n",
2078 (long long)ip->trunc_off);
2079 #endif
2080 } else {
2081 #ifdef DEBUG_TRUNCATE
2082 if (ip == HammerTruncIp)
2083 kprintf("truncate3 %016llx (ignored)\n",
2084 (long long)vap->va_size);
2085 #endif
2086 }
2089 /*
2090 * If truncating we have to clean out a portion of
2091 * the last block on-disk. We do this in the
2092 * front-end buffer cache.
2093 */
2094 aligned_size = (vap->va_size + (blksize - 1)) &
2095 ~(int64_t)(blksize - 1);
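/*
 * Worked example (annotation, not part of the original source),
 * assuming a 16KB block size at this offset: truncating to
 * va_size 20000 gives aligned_size = (20000 + 16383) & ~16383 =
 * 32768. Since 20000 < 32768 we back up one block to 16384,
 * bread that last partial block, and bzero it from offset
 * 20000 & 16383 = 3616 to the end, cleaning out the stale tail.
 */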
2096 if (truncating && vap->va_size < aligned_size) {
2097 struct buf *bp;
2098 int offset;
2100 aligned_size -= blksize;
2102 offset = (int)vap->va_size & (blksize - 1);
2103 error = bread(ap->a_vp, aligned_size,
2104 blksize, &bp);
2105 hammer_ip_frontend_trunc(ip, aligned_size);
2106 if (error == 0) {
2107 bzero(bp->b_data + offset,
2108 blksize - offset);
2109 /* must de-cache direct-io offset */
2110 bp->b_bio2.bio_offset = NOOFFSET;
2111 bdwrite(bp);
2112 } else {
2113 kprintf("ERROR %d\n", error);
2114 brelse(bp);
2115 }
2116 }
2117 break;
2118 case VDATABASE:
2119 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2120 ip->flags |= HAMMER_INODE_TRUNCATED;
2121 ip->trunc_off = vap->va_size;
2122 } else if (ip->trunc_off > vap->va_size) {
2123 ip->trunc_off = vap->va_size;
2124 }
2125 hammer_ip_frontend_trunc(ip, vap->va_size);
2126 ip->ino_data.size = vap->va_size;
2127 ip->ino_data.mtime = trans.time;
2128 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2129 kflags |= NOTE_ATTRIB;
2130 break;
2131 default:
2132 error = EINVAL;
2133 goto done;
2134 }
2135 break;
2136 }
2137 if (vap->va_atime.tv_sec != VNOVAL) {
2138 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2139 modflags |= HAMMER_INODE_ATIME;
2140 kflags |= NOTE_ATTRIB;
2141 }
2142 if (vap->va_mtime.tv_sec != VNOVAL) {
2143 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2144 modflags |= HAMMER_INODE_MTIME;
2145 kflags |= NOTE_ATTRIB;
2146 }
2147 if (vap->va_mode != (mode_t)VNOVAL) {
2148 mode_t cur_mode = ip->ino_data.mode;
2149 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2150 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2152 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2153 cur_uid, cur_gid, &cur_mode);
2154 if (error == 0 && ip->ino_data.mode != cur_mode) {
2155 ip->ino_data.mode = cur_mode;
2156 ip->ino_data.ctime = trans.time;
2157 modflags |= HAMMER_INODE_DDIRTY;
2158 kflags |= NOTE_ATTRIB;
2159 }
2160 }
2161 done:
2162 if (error == 0)
2163 hammer_modify_inode(ip, modflags);
2164 hammer_done_transaction(&trans);
2165 hammer_knote(ap->a_vp, kflags);
2166 return (error);
2167 }
2169 /*
2170 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2171 */
2172 static
2173 int
2174 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2175 {
2176 struct hammer_transaction trans;
2177 struct hammer_inode *dip;
2178 struct hammer_inode *nip;
2179 struct nchandle *nch;
2180 hammer_record_t record;
2181 int error;
2182 int bytes;
2184 ap->a_vap->va_type = VLNK;
2186 nch = ap->a_nch;
2187 dip = VTOI(ap->a_dvp);
2189 if (dip->flags & HAMMER_INODE_RO)
2190 return (EROFS);
2191 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
2192 return (error);
2194 /*
2195 * Create a transaction to cover the operations we perform.
2196 */
2197 hammer_start_transaction(&trans, dip->hmp);
2198 ++hammer_stats_file_iopsw;
2200 /*
2201 * Create a new filesystem object of the requested type. The
2202 * returned inode will be referenced but not locked.
2203 */
2205 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2206 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2207 NULL, &nip);
2208 if (error) {
2209 hammer_done_transaction(&trans);
2210 *ap->a_vpp = NULL;
2211 return (error);
2212 }
2214 /*
2215 * Add a record representing the symlink. A symlink stores the link
2216 * as pure data, not a string, and is not \0 terminated.
2217 */
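/*
 * Illustrative note (annotation, not part of the original source):
 * a short target such as "../lib" fits in the inode's embedded
 * symlink area when bytes <= HAMMER_INODE_BASESYMLEN and needs no
 * separate record; longer targets get a HAMMER_RECTYPE_FIX record
 * keyed by HAMMER_FIXKEY_SYMLINK holding the path as raw data.
 */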
2218 if (error == 0) {
2219 bytes = strlen(ap->a_target);
2221 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2222 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2223 } else {
2224 record = hammer_alloc_mem_record(nip, bytes);
2225 record->type = HAMMER_MEM_RECORD_GENERAL;
2227 record->leaf.base.localization = nip->obj_localization +
2228 HAMMER_LOCALIZE_MISC;
2229 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2230 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2231 record->leaf.data_len = bytes;
2232 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2233 bcopy(ap->a_target, record->data->symlink.name, bytes);
2234 error = hammer_ip_add_record(&trans, record);
2235 }
2237 /*
2238 * Set the file size to the length of the link.
2239 */
2240 if (error == 0) {
2241 nip->ino_data.size = bytes;
2242 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
2243 }
2244 }
2245 if (error == 0)
2246 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2247 nch->ncp->nc_nlen, nip);
2249 /*
2250 * Finish up.
2251 */
2252 if (error) {
2253 hammer_rel_inode(nip, 0);
2254 *ap->a_vpp = NULL;
2255 } else {
2256 error = hammer_get_vnode(nip, ap->a_vpp);
2257 hammer_rel_inode(nip, 0);
2258 if (error == 0) {
2259 cache_setunresolved(ap->a_nch);
2260 cache_setvp(ap->a_nch, *ap->a_vpp);
2261 hammer_knote(ap->a_dvp, NOTE_WRITE);
2262 }
2263 }
2264 hammer_done_transaction(&trans);
2265 return (error);
2266 }
2268 /*
2269 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2270 */
2271 static
2272 int
2273 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2274 {
2275 struct hammer_transaction trans;
2276 struct hammer_inode *dip;
2277 int error;
2279 dip = VTOI(ap->a_dvp);
2281 if (hammer_nohistory(dip) == 0 &&
2282 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2283 return (error);
2284 }
2286 hammer_start_transaction(&trans, dip->hmp);
2287 ++hammer_stats_file_iopsw;
2288 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2289 ap->a_cred, ap->a_flags, -1);
2290 hammer_done_transaction(&trans);
2292 return (error);
2293 }
2295 /*
2296 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2297 */
2298 static
2299 int
2300 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2301 {
2302 struct hammer_inode *ip = ap->a_vp->v_data;
2304 ++hammer_stats_file_iopsr;
2305 return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2306 ap->a_fflag, ap->a_cred));
2307 }
2309 static
2310 int
2311 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2312 {
2313 static const struct mountctl_opt extraopt[] = {
2314 { HMNT_NOHISTORY, "nohistory" },
2315 { HMNT_MASTERID, "master" },
2316 { 0, NULL}
2317 };
2319 struct hammer_mount *hmp;
2320 struct mount *mp;
2321 int usedbytes;
2322 int error;
2324 error = 0;
2325 usedbytes = 0;
2326 mp = ap->a_head.a_ops->head.vv_mount;
2327 KKASSERT(mp->mnt_data != NULL);
2328 hmp = (struct hammer_mount *)mp->mnt_data;
2330 switch(ap->a_op) {
2332 case MOUNTCTL_SET_EXPORT:
2333 if (ap->a_ctllen != sizeof(struct export_args))
2334 error = EINVAL;
2335 else
2336 error = hammer_vfs_export(mp, ap->a_op,
2337 (const struct export_args *)ap->a_ctl);
2338 break;
2339 case MOUNTCTL_MOUNTFLAGS:
2340 {
2341 /*
2342 * Call standard mountctl VOP function
2343 * so we get user mount flags.
2344 */
2345 error = vop_stdmountctl(ap);
2346 if (error)
2347 break;
2349 usedbytes = *ap->a_res;
2351 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2352 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
2353 ap->a_buflen - usedbytes,
2354 &error);
2355 }
2357 *ap->a_res += usedbytes;
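/*
 * Example (annotation, not part of the original source): if
 * vop_stdmountctl() produced "ro,noatime" and this volume was
 * mounted with nohistory and a master id, the pass above would
 * append the "nohistory" and "master" keywords from extraopt,
 * assuming the caller's buffer has room for them.
 */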
2358 break;
2359 }
2360 default:
2361 error = vop_stdmountctl(ap);
2362 break;
2363 }
2364 return(error);
2365 }
2367 /*
2368 * hammer_vop_strategy { vp, bio }
2369 *
2370 * Strategy call, used for regular file read & write only. Note that the
2371 * bp may represent a cluster.
2372 *
2373 * To simplify operation and allow better optimizations in the future,
2374 * this code does not make any assumptions with regard to buffer alignment
2375 * or size.
2376 */
2377 static
2378 int
2379 hammer_vop_strategy(struct vop_strategy_args *ap)
2380 {
2381 struct buf *bp;
2382 int error;
2384 bp = ap->a_bio->bio_buf;
2386 switch(bp->b_cmd) {
2387 case BUF_CMD_READ:
2388 error = hammer_vop_strategy_read(ap);
2389 break;
2390 case BUF_CMD_WRITE:
2391 error = hammer_vop_strategy_write(ap);
2392 break;
2393 default:
2394 bp->b_error = error = EINVAL;
2395 bp->b_flags |= B_ERROR;
2396 biodone(ap->a_bio);
2397 break;
2398 }
2399 return (error);
2400 }
2402 /*
2403 * Read from a regular file. Iterate the related records and fill in the
2404 * BIO/BUF. Gaps are zero-filled.
2405 *
2406 * The support code in hammer_object.c should be used to deal with mixed
2407 * in-memory and on-disk records.
2408 *
2409 * NOTE: Can be called from the cluster code with an oversized buf.
2410 *
2411 * XXX atime update
2412 */
2413 static
2414 int
2415 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2416 {
2417 struct hammer_transaction trans;
2418 struct hammer_inode *ip;
2419 struct hammer_inode *dip;
2420 struct hammer_cursor cursor;
2421 hammer_base_elm_t base;
2422 hammer_off_t disk_offset;
2423 struct bio *bio;
2424 struct bio *nbio;
2425 struct buf *bp;
2426 int64_t rec_offset;
2427 int64_t ran_end;
2428 int64_t tmp64;
2429 int error;
2430 int boff;
2431 int roff;
2432 int n;
2434 bio = ap->a_bio;
2435 bp = bio->bio_buf;
2436 ip = ap->a_vp->v_data;
2438 /*
2439 * The zone-2 disk offset may have been set by the cluster code via
2440 * a BMAP operation, or else should be NOOFFSET.
2441 *
2442 * Checking the high bits for a match against zone-2 should suffice.
2443 */
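/*
 * Illustrative note (annotation, not part of the original source):
 * HAMMER_OFF_ZONE_MASK isolates the zone field encoded in the top
 * bits of a hammer_off_t, so a bio_offset left at NOOFFSET, or any
 * offset outside the large-data zone, fails this compare and we
 * fall through to the B-Tree scan below.
 */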
2444 nbio = push_bio(bio);
2445 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2446 HAMMER_ZONE_LARGE_DATA) {
2447 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2448 return (error);
2449 }
2451 /*
2452 * Well, that sucked. Do it the hard way. If all the stars are
2453 * aligned we may still be able to issue a direct-read.
2454 */
2455 hammer_simple_transaction(&trans, ip->hmp);
2456 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2458 /*
2459 * Key range (begin and end inclusive) to scan. Note that the keys
2460 * stored in the actual records represent BASE+LEN, not BASE. The
2461 * first record containing bio_offset will have a key > bio_offset.
2462 */
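/*
 * Worked example (annotation, not part of the original source): a
 * record covering file bytes [0, 16384) is keyed at 16384
 * (BASE+LEN), so a bio at offset 0 starts its scan at
 * key_beg.key = 1 and the first record with key > 1 is exactly
 * the record containing offset 0.
 */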
2463 cursor.key_beg.localization = ip->obj_localization +
2464 HAMMER_LOCALIZE_MISC;
2465 cursor.key_beg.obj_id = ip->obj_id;
2466 cursor.key_beg.create_tid = 0;
2467 cursor.key_beg.delete_tid = 0;
2468 cursor.key_beg.obj_type = 0;
2469 cursor.key_beg.key = bio->bio_offset + 1;
2470 cursor.asof = ip->obj_asof;
2471 cursor.flags |= HAMMER_CURSOR_ASOF;
2473 cursor.key_end = cursor.key_beg;
2474 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2475 #if 0
2476 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2477 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2478 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2479 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2480 } else
2481 #endif
2482 {
2483 ran_end = bio->bio_offset + bp->b_bufsize;
2484 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2485 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2486 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2487 if (tmp64 < ran_end)
2488 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2489 else
2490 cursor.key_end.key = ran_end + MAXPHYS + 1;
2491 }
2492 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2494 error = hammer_ip_first(&cursor);
2495 boff = 0;
2497 while (error == 0) {
2498 /*
2499 * Get the base file offset of the record. The key for
2500 * data records is (base + bytes) rather than (base).
2501 */
2502 base = &cursor.leaf->base;
2503 rec_offset = base->key - cursor.leaf->data_len;
2505 /*
2506 * Calculate the gap, if any, and zero-fill it.
2507 *
2508 * n is the offset of the start of the record versus our
2509 * current seek offset in the bio.
2510 */
2511 n = (int)(rec_offset - (bio->bio_offset + boff));
2512 if (n > 0) {
2513 if (n > bp->b_bufsize - boff)
2514 n = bp->b_bufsize - boff;
2515 bzero((char *)bp->b_data + boff, n);
2516 boff += n;
2517 n = 0;
2518 }
2520 /*
2521 * Calculate the data offset in the record and the number
2522 * of bytes we can copy.
2523 *
2524 * There are two degenerate cases. First, boff may already
2525 * be at bp->b_bufsize. Second, the data offset within
2526 * the record may exceed the record's size.
2527 */
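/*
 * Worked example (annotation, not part of the original source): if
 * the bio is at offset 32768 and the next record starts at
 * rec_offset 40960, the 8192-byte gap is bzero'd and boff advances
 * past it. If instead the record began before our seek point, n is
 * negative and roff = -n becomes the offset of our seek point
 * within the record's data.
 */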
2528 roff = -n;
2529 rec_offset += roff;
2530 n = cursor.leaf->data_len - roff;
2531 if (n <= 0) {
2532 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2533 n = 0;
2534 } else if (n > bp->b_bufsize - boff) {
2535 n = bp->b_bufsize - boff;
2536 }
2538 /*
2539 * Deal with cached truncations. This cool bit of code
2540 * allows truncate()/ftruncate() to avoid having to sync
2541 * the file.
2542 *
2543 * If the frontend is truncated then all backend records are
2544 * subject to the frontend's truncation.
2545 *
2546 * If the backend is truncated then backend records on-disk
2547 * (but not in-memory) are subject to the backend's
2548 * truncation. In-memory records owned by the backend
2549 * represent data written after the truncation point on the
2550 * backend and must not be truncated.
2551 *
2552 * Truncate operations deal with frontend buffer cache
2553 * buffers and frontend-owned in-memory records synchronously.
2554 */
2555 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2556 if (hammer_cursor_ondisk(&cursor) ||
2557 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2558 if (ip->trunc_off <= rec_offset)
2559 n = 0;
2560 else if (ip->trunc_off < rec_offset + n)
2561 n = (int)(ip->trunc_off - rec_offset);
2562 }
2563 }
2564 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2565 if (hammer_cursor_ondisk(&cursor)) {
2566 if (ip->sync_trunc_off <= rec_offset)
2567 n = 0;
2568 else if (ip->sync_trunc_off < rec_offset + n)
2569 n = (int)(ip->sync_trunc_off - rec_offset);
2570 }
2571 }
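/*
 * Example (annotation, not part of the original source): with a
 * cached frontend truncation at trunc_off 10000, an on-disk record
 * covering [8192, 24576) is clipped to n = 10000 - 8192 = 1808
 * bytes and any record based at or beyond 10000 contributes
 * nothing, so the read reflects the truncate() before the flusher
 * has touched the media.
 */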
2573 /*
2574 * Try to issue a direct read into our bio if possible,
2575 * otherwise resolve the element data into a hammer_buffer
2576 * and copy.
2577 *
2578 * The buffer on-disk should be zeroed past any real
2579 * truncation point, but may not be for any synthesized
2580 * truncation point from above.
2581 */
2582 disk_offset = cursor.leaf->data_offset + roff;
2583 if (boff == 0 && n == bp->b_bufsize &&
2584 hammer_cursor_ondisk(&cursor) &&
2585 (disk_offset & HAMMER_BUFMASK) == 0) {
2586 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2587 HAMMER_ZONE_LARGE_DATA);
2588 nbio->bio_offset = disk_offset;
2589 error = hammer_io_direct_read(trans.hmp, nbio,
2590 cursor.leaf);
2591 goto done;
2592 } else if (n) {
2593 error = hammer_ip_resolve_data(&cursor);
2594 if (error == 0) {
2595 bcopy((char *)cursor.data + roff,
2596 (char *)bp->b_data + boff, n);
2597 }
2598 }
2599 if (error)
2600 break;
2602 /*
2603 * Iterate until we have filled the request.
2604 */
2605 boff += n;
2606 if (boff == bp->b_bufsize)
2607 break;
2608 error = hammer_ip_next(&cursor);
2609 }
2611 /*
2612 * There may have been a gap after the last record.
2613 */
2614 if (error == ENOENT)
2615 error = 0;
2616 if (error == 0 && boff != bp->b_bufsize) {
2617 KKASSERT(boff < bp->b_bufsize);
2618 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2619 /* boff = bp->b_bufsize; */
2620 }
2621 bp->b_resid = 0;
2622 bp->b_error = error;
2623 if (error)
2624 bp->b_flags |= B_ERROR;
2625 biodone(ap->a_bio);
2627 done:
2628 /*
2629 * Cache the B-Tree node for the last data read in cache[1].
2630 *
2631 * If we hit the file EOF then also cache the node in the
2632 * governing directory's cache[3]; it will be used to initialize
2633 * the inode's cache[1] for any inodes looked up via the directory.
2634 *
2635 * This doesn't reduce disk accesses since the B-Tree chain is
2636 * likely cached, but it does reduce cpu overhead when looking
2637 * up file offsets for cpdup/tar/cpio style iterations.
2638 */
2639 if (cursor.node)
2640 hammer_cache_node(&ip->cache[1], cursor.node);
2641 if (ran_end >= ip->ino_data.size) {
2642 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2643 ip->obj_asof, ip->obj_localization);
2644 if (dip) {
2645 hammer_cache_node(&dip->cache[3], cursor.node);
2646 hammer_rel_inode(dip, 0);
2647 }
2648 }
2649 hammer_done_cursor(&cursor);
2650 hammer_done_transaction(&trans);
2651 return(error);
2652 }
2654 /*
2655 * BMAP operation - used to support cluster_read() only.
2656 *
2657 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2658 *
2659 * This routine may return EOPNOTSUPP if the operation is not supported for
2660 * the specified offset. The contents of the pointer arguments do not
2661 * need to be initialized in that case.
2662 *
2663 * If a disk address is available and properly aligned return 0 with
2664 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2665 * to the run-length relative to that offset. Callers may assume that
2666 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2667 * large, so return EOPNOTSUPP if it is not sufficiently large.
2668 */
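/*
 * Illustrative example (annotation, not part of the original
 * source): for a loffset falling inside a contiguous, properly
 * aligned 64KB on-media run, *doffsetp receives the zone-2 address
 * of that byte, *runp the bytes of forward run remaining, and
 * *runb the bytes of backward run, allowing cluster_read() to
 * build one large I/O.
 */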
2669 static
2670 int
2671 hammer_vop_bmap(struct vop_bmap_args *ap)
2672 {
2673 struct hammer_transaction trans;
2674 struct hammer_inode *ip;
2675 struct hammer_cursor cursor;
2676 hammer_base_elm_t base;
2677 int64_t rec_offset;
2678 int64_t ran_end;
2679 int64_t tmp64;
2680 int64_t base_offset;
2681 int64_t base_disk_offset;
2682 int64_t last_offset;
2683 hammer_off_t last_disk_offset;
2684 hammer_off_t disk_offset;
2685 int rec_len;
2686 int error;
2687 int blksize;
2689 ++hammer_stats_file_iopsr;
2690 ip = ap->a_vp->v_data;
2692 /*
2693 * We can only BMAP regular files. We can't BMAP database files,
2694 * directories, etc.
2695 */
2696 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2697 return(EOPNOTSUPP);
2699 /*
2700 * bmap is typically called with runp/runb both NULL when used
2701 * for writing. We do not support BMAP for writing at the moment.
2702 */
2703 if (ap->a_cmd != BUF_CMD_READ)
2704 return(EOPNOTSUPP);
2706 /*
2707 * Scan the B-Tree to acquire blockmap addresses, then translate
2708 * to raw addresses.
2709 */
2710 hammer_simple_transaction(&trans, ip->hmp);
2711 #if 0
2712 kprintf("bmap_beg %016llx ip->cache %p\n",
2713 (long long)ap->a_loffset, ip->cache[1]);
2714 #endif
2715 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2717 /*
2718 * Key range (begin and end inclusive) to scan. Note that the keys
2719 * stored in the actual records represent BASE+LEN, not BASE. The
2720 * first record containing bio_offset will have a key > bio_offset.
2721 */
2722 cursor.key_beg.localization = ip->obj_localization +
2723 HAMMER_LOCALIZE_MISC;
2724 cursor.key_beg.obj_id = ip->obj_id;
2725 cursor.key_beg.create_tid = 0;
2726 cursor.key_beg.delete_tid = 0;
2727 cursor.key_beg.obj_type = 0;
2728 if (ap->a_runb)
2729 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2730 else
2731 cursor.key_beg.key = ap->a_loffset + 1;
2732 if (cursor.key_beg.key < 0)
2733 cursor.key_beg.key = 0;
2734 cursor.asof = ip->obj_asof;
2735 cursor.flags |= HAMMER_CURSOR_ASOF;
2737 cursor.key_end = cursor.key_beg;
2738 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2740 ran_end = ap->a_loffset + MAXPHYS;
2741 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2742 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2743 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2744 if (tmp64 < ran_end)
2745 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2746 else
2747 cursor.key_end.key = ran_end + MAXPHYS + 1;
2749 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2751 error = hammer_ip_first(&cursor);
2752 base_offset = last_offset = 0;
2753 base_disk_offset = last_disk_offset = 0;
2755 while (error == 0) {
2756 /*
2757 * Get the base file offset of the record. The key for
2758 * data records is (base + bytes) rather than (base).
2759 *
2760 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2761 * The extra bytes should be zero on-disk and the BMAP op
2762 * should still be ok.
2763 */
2764 base = &cursor.leaf->base;
2765 rec_offset = base->key - cursor.leaf->data_len;
2766 rec_len = cursor.leaf->data_len;
2768 /*
2769 * Incorporate any cached truncation.
2770 *
2771 * NOTE: Modifications to rec_len based on synthesized
2772 * truncation points remove the guarantee that any extended
2773 * data on disk is zero (since the truncations may not have
2774 * taken place on-media yet).
2775 */
2776 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2777 if (hammer_cursor_ondisk(&cursor) ||
2778 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2779 if (ip->trunc_off <= rec_offset)
2780 rec_len = 0;
2781 else if (ip->trunc_off < rec_offset + rec_len)
2782 rec_len = (int)(ip->trunc_off - rec_offset);
2783 }
2784 }
2785 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2786 if (hammer_cursor_ondisk(&cursor)) {
2787 if (ip->sync_trunc_off <= rec_offset)
2788 rec_len = 0;
2789 else if (ip->sync_trunc_off < rec_offset + rec_len)
2790 rec_len = (int)(ip->sync_trunc_off - rec_offset);
2791 }
2792 }
2794 /*
2795 * Accumulate information. If we have hit a discontiguous
2796 * block reset base_offset unless we are already beyond the
2797 * requested offset. If we are, that's it, we stop.
2798 */
2799 if (error)
2800 break;
2801 if (hammer_cursor_ondisk(&cursor)) {
2802 disk_offset = cursor.leaf->data_offset;
2803 if (rec_offset != last_offset ||
2804 disk_offset != last_disk_offset) {
2805 if (rec_offset > ap->a_loffset)
2806 break;
2807 base_offset = rec_offset;
2808 base_disk_offset = disk_offset;
2809 }
2810 last_offset = rec_offset + rec_len;
2811 last_disk_offset = disk_offset + rec_len;
2812 }
2813 error = hammer_ip_next(&cursor);
2814 }
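/*
 * Example (annotation, not part of the original source): records
 * covering [0, 16K) and [16K, 32K) whose zone-2 addresses are also
 * adjacent extend a single run; if the second record lived at an
 * unrelated disk address the run would restart there, or the scan
 * would stop early once the requested offset was already covered.
 */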
2816 #if 0
2817 kprintf("BMAP %016llx: %016llx - %016llx\n",
2818 (long long)ap->a_loffset,
2819 (long long)base_offset,
2820 (long long)last_offset);
2821 kprintf("BMAP %16s: %016llx - %016llx\n", "",
2822 (long long)base_disk_offset,
2823 (long long)last_disk_offset);
2824 #endif
2826 if (cursor.node) {
2827 hammer_cache_node(&ip->cache[1], cursor.node);
2828 #if 0
2829 kprintf("bmap_end2 %016llx ip->cache %p\n",
2830 (long long)ap->a_loffset, ip->cache[1]);
2831 #endif
2832 }
2833 hammer_done_cursor(&cursor);
2834 hammer_done_transaction(&trans);
2836 /*
2837 * If we couldn't find any records or the records we did find were
2838 * all behind the requested offset, return failure. A forward
2839 * truncation can leave a hole with no on-disk records.
2840 */
2841 if (last_offset == 0 || last_offset < ap->a_loffset)
2842 return (EOPNOTSUPP);
2844 /*
2845 * Figure out the block size at the requested offset and adjust
2846 * our limits so the cluster_read() does not create inappropriately
2847 * sized buffer cache buffers.
2848 */
2849 blksize = hammer_blocksize(ap->a_loffset);
2850 if (hammer_blocksize(base_offset) != blksize) {
2851 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2852 }
2853 if (last_offset != ap->a_loffset &&
2854 hammer_blocksize(last_offset - 1) != blksize) {
2855 last_offset = hammer_blockdemarc(ap->a_loffset,
2856 last_offset - 1);
2857 }
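/*
 * Illustrative note (annotation, not part of the original source),
 * assuming 16KB blocks below the 1MB demarcation and 64KB blocks
 * above it: a run straddling the demarcation is clipped here so
 * that cluster_read() never builds a buffer cache buffer spanning
 * both block sizes.
 */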
2859 /*
2860 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2861 * from occurring.
2862 */
2863 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2865 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2866 /*
2867 * Only large-data zones can be direct-IOd
2868 */
2869 error = EOPNOTSUPP;
2870 } else if ((disk_offset & HAMMER_BUFMASK) ||
2871 (last_offset - ap->a_loffset) < blksize) {
2872 /*
2873 * doffsetp is not aligned or the forward run size does
2874 * not cover a whole buffer; disallow the direct I/O.
2875 */
2876 error = EOPNOTSUPP;
2877 } else {
2878 /*
2879 * We're good.
2880 */
2881 *ap->a_doffsetp = disk_offset;
2882 if (ap->a_runb) {
2883 *ap->a_runb = ap->a_loffset - base_offset;
2884 KKASSERT(*ap->a_runb >= 0);
2885 }
2886 if (ap->a_runp) {
2887 *ap->a_runp = last_offset - ap->a_loffset;
2888 KKASSERT(*ap->a_runp >= 0);
2889 }
2890 error = 0;
2891 }
2892 return(error);
2893 }
2895 /*
2896 * Write to a regular file. Because this is a strategy call the OS is
2897 * trying to actually get data onto the media.
2898 */
2899 static
2900 int
2901 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2902 {
2903 hammer_record_t record;
2904 hammer_mount_t hmp;
2905 hammer_inode_t ip;
2906 struct bio *bio;
2907 struct buf *bp;
2908 int blksize;
2909 int bytes;
2910 int error;
2912 bio = ap->a_bio;
2913 bp = bio->bio_buf;
2914 ip = ap->a_vp->v_data;
2915 hmp = ip->hmp;
2917 blksize = hammer_blocksize(bio->bio_offset);
2918 KKASSERT(bp->b_bufsize == blksize);
2920 if (ip->flags & HAMMER_INODE_RO) {
2921 bp->b_error = EROFS;
2922 bp->b_flags |= B_ERROR;
2923 biodone(ap->a_bio);
2924 return(EROFS);
2925 }
2927 /*
2928 * Interlock with inode destruction (no in-kernel or directory
2929 * topology visibility). If we queue new IO while trying to
2930 * destroy the inode we can deadlock the vtrunc call in
2931 * hammer_inode_unloadable_check().
2932 *
2933 * Besides, there's no point flushing a bp associated with an
2934 * inode that is being destroyed on-media and has no kernel
2935 * references.
2936 */
2937 if ((ip->flags | ip->sync_flags) &
2938 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2939 bp->b_resid = 0;
2940 biodone(ap->a_bio);
2941 return(0);
2942 }
2944 /*
2945 * Reserve space and issue a direct-write from the front-end.
2946 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2947 * allocations.
2948 *
2949 * An in-memory record will be installed to reference the storage
2950 * until the flusher can get to it.
2951 *
2952 * Since we own the high level bio the front-end will not try to
2953 * do a direct-read until the write completes.
2954 *
2955 * NOTE: The only time we do not reserve a full-sized buffer's
2956 * worth of data is if the file is small. We do not try to
2957 * allocate a fragment (from the small-data zone) at the end of
2958 * an otherwise large file as this can lead to wildly separated
2959 * data.
2960 */
2961 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2962 KKASSERT(bio->bio_offset < ip->ino_data.size);
2963 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2964 bytes = bp->b_bufsize;
2965 else
2966 bytes = ((int)ip->ino_data.size + 15) & ~15;
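/*
 * Example (annotation, not part of the original source): a
 * 100-byte file written at offset 0 reserves (100 + 15) & ~15 =
 * 112 bytes from the small-data zone instead of a full buffer;
 * any file larger than half a buffer, or any write at a non-zero
 * offset, reserves the full bp->b_bufsize.
 */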
2968 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2969 bytes, &error);
2970 if (record) {
2971 hammer_io_direct_write(hmp, record, bio);
2972 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2973 hammer_flush_inode(ip, 0);
2974 } else {
2975 bp->b_bio2.bio_offset = NOOFFSET;
2976 bp->b_error = error;
2977 bp->b_flags |= B_ERROR;
2978 biodone(ap->a_bio);
2979 }
2980 return(error);
2981 }
2983 /*
2984 * dounlink - disconnect a directory entry
2985 *
2986 * XXX whiteout support not really in yet
2987 */
2988 static int
2989 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2990 struct vnode *dvp, struct ucred *cred,
2991 int flags, int isdir)
2992 {
2993 struct namecache *ncp;
2994 hammer_inode_t dip;
2995 hammer_inode_t ip;
2996 struct hammer_cursor cursor;
2997 int64_t namekey;
2998 u_int32_t max_iterations;
2999 int nlen, error;
3001 /*
3002 * Calculate the namekey and set up the key range for the scan. This
3003 * works kinda like a chained hash table where the lower 32 bits
3004 * of the namekey synthesize the chain.
3005 *
3006 * The key range is inclusive of both key_beg and key_end.
3007 */
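/*
 * Illustrative note (annotation, not part of the original source):
 * names that hash to the same namekey are stored at namekey,
 * namekey+1, and so on, so the scan below walks at most
 * max_iterations keys and compares the stored name bytes to
 * resolve the collision chain, like probing one bucket of a
 * chained hash table.
 */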
3008 dip = VTOI(dvp);
3009 ncp = nch->ncp;
3011 if (dip->flags & HAMMER_INODE_RO)
3012 return (EROFS);
3014 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3015 &max_iterations);
3016 retry:
3017 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3018 cursor.key_beg.localization = dip->obj_localization +
3019 hammer_dir_localization(dip);
3020 cursor.key_beg.obj_id = dip->obj_id;
3021 cursor.key_beg.key = namekey;
3022 cursor.key_beg.create_tid = 0;
3023 cursor.key_beg.delete_tid = 0;
3024 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3025 cursor.key_beg.obj_type = 0;
3027 cursor.key_end = cursor.key_beg;
3028 cursor.key_end.key += max_iterations;
3029 cursor.asof = dip->obj_asof;
3030 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3032 /*
3033 * Scan all matching records (the chain), locate the one matching
3034 * the requested path component. info->last_error contains the
3035 * error code on search termination and could be 0, ENOENT, or
3036 * something else.
3037 *
3038 * The hammer_ip_*() functions merge in-memory records with on-disk
3039 * records for the purposes of the search.
3040 */
3041 error = hammer_ip_first(&cursor);
3043 while (error == 0) {
3044 error = hammer_ip_resolve_data(&cursor);
3045 if (error)
3046 break;
3047 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3048 KKASSERT(nlen > 0);
3049 if (ncp->nc_nlen == nlen &&
3050 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3051 break;
3052 }
3053 error = hammer_ip_next(&cursor);
3054 }
3056 /*
3057 * If all is ok we have to get the inode so we can adjust nlinks.
3058 * To avoid a deadlock with the flusher we must release the inode
3059 * lock on the directory when acquiring the inode for the entry.
3060 *
3061 * If the target is a directory, it must be empty.
3062 */
3063 if (error == 0) {
3064 hammer_unlock(&cursor.ip->lock);
3065 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3066 dip->hmp->asof,
3067 cursor.data->entry.localization,
3068 0, &error);
3069 hammer_lock_sh(&cursor.ip->lock);
3070 if (error == ENOENT) {
3071 kprintf("HAMMER: WARNING: Removing "
3072 "dirent w/missing inode \"%s\"\n"
3073 "\tobj_id = %016llx\n",
3074 ncp->nc_name,
3075 (long long)cursor.data->entry.obj_id);
3076 error = 0;
3077 }
3079 /*
3080 * If isdir >= 0 we validate that the entry is or is not a
3081 * directory. If isdir < 0 we don't care.
3082 */
3083 if (error == 0 && isdir >= 0 && ip) {
3084 if (isdir &&
3085 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3086 error = ENOTDIR;
3087 } else if (isdir == 0 &&
3088 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3089 error = EISDIR;
3090 }
3091 }
3093 /*
3094 * If we are trying to remove a directory the directory must
3095 * be empty.
3096 *
3097 * The check directory code can loop and deadlock/retry. Our
3098 * own cursor's node locks must be released to avoid a 3-way
3099 * deadlock with the flusher if the check directory code
3100 * blocks.
3101 *
3102 * If any changes whatsoever have been made to the cursor
3103 * set EDEADLK and retry.
3104 *
3105 * WARNING: See warnings in hammer_unlock_cursor()
3106 * function.
3107 */
3108 if (error == 0 && ip && ip->ino_data.obj_type ==
3109 HAMMER_OBJTYPE_DIRECTORY) {
3110 hammer_unlock_cursor(&cursor);
3111 error = hammer_ip_check_directory_empty(trans, ip);
3112 hammer_lock_cursor(&cursor);
3113 if (cursor.flags & HAMMER_CURSOR_RETEST) {
3114 kprintf("HAMMER: Warning: avoided deadlock "
3115 "on rmdir '%s'\n",
3116 ncp->nc_name);
3117 error = EDEADLK;
3118 }
3119 }
3121 /*
3122 * Delete the directory entry.
3123 *
3124 * WARNING: hammer_ip_del_directory() may have to terminate
3125 * the cursor to avoid a deadlock. It is ok to call
3126 * hammer_done_cursor() twice.
3127 */
3128 if (error == 0) {
3129 error = hammer_ip_del_directory(trans, &cursor,
3130 dip, ip);
3131 }
3132 hammer_done_cursor(&cursor);
3133 if (error == 0) {
3134 cache_setunresolved(nch);
3135 cache_setvp(nch, NULL);
3136 /* XXX locking */
3137 if (ip && ip->vp) {
3138 hammer_knote(ip->vp, NOTE_DELETE);
3139 cache_inval_vp(ip->vp, CINV_DESTROY);
3140 }
3142 if (ip)
3143 hammer_rel_inode(ip, 0);
3144 } else {
3145 hammer_done_cursor(&cursor);
3146 }
3147 if (error == EDEADLK)
3148 goto retry;
3150 return (error);
3151 }
3153 /************************************************************************
3154 * FIFO AND SPECFS OPS *
3155 ************************************************************************
3156 *
3157 */
3159 static int
3160 hammer_vop_fifoclose (struct vop_close_args *ap)
3161 {
3162 /* XXX update itimes */
3163 return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3164 }
3166 static int
3167 hammer_vop_fiforead (struct vop_read_args *ap)
3168 {
3169 int error;
3171 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3172 /* XXX update access time */
3173 return (error);
3174 }
3176 static int
3177 hammer_vop_fifowrite (struct vop_write_args *ap)
3178 {
3179 int error;
3181 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3182 /* XXX update access time */
3183 return (error);
3184 }
3186 static
3187 int
3188 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3189 {
3190 int error;
3192 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3193 if (error)
3194 error = hammer_vop_kqfilter(ap);
3195 return(error);
3196 }
3198 /************************************************************************
3199 * KQFILTER OPS *
3200 ************************************************************************
3201 */
3203 static void filt_hammerdetach(struct knote *kn);
3204 static int filt_hammerread(struct knote *kn, long hint);
3205 static int filt_hammerwrite(struct knote *kn, long hint);
3206 static int filt_hammervnode(struct knote *kn, long hint);
3208 static struct filterops hammerread_filtops =
3209 { 1, NULL, filt_hammerdetach, filt_hammerread };
3210 static struct filterops hammerwrite_filtops =
3211 { 1, NULL, filt_hammerdetach, filt_hammerwrite };
3212 static struct filterops hammervnode_filtops =
3213 { 1, NULL, filt_hammerdetach, filt_hammervnode };
3215 static
3216 int
3217 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3218 {
3219 struct vnode *vp = ap->a_vp;
3220 struct knote *kn = ap->a_kn;
3221 lwkt_tokref vlock;
3223 switch (kn->kn_filter) {
3224 case EVFILT_READ:
3225 kn->kn_fop = &hammerread_filtops;
3226 break;
3227 case EVFILT_WRITE:
3228 kn->kn_fop = &hammerwrite_filtops;
3229 break;
3230 case EVFILT_VNODE:
3231 kn->kn_fop = &hammervnode_filtops;
3232 break;
3233 default:
3234 return (1);
3235 }
3237 kn->kn_hook = (caddr_t)vp;
3239 lwkt_gettoken(&vlock, &vp->v_token);
3240 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3241 lwkt_reltoken(&vlock);
3243 return(0);
3244 }
3246 static void
3247 filt_hammerdetach(struct knote *kn)
3248 {
3249 struct vnode *vp = (void *)kn->kn_hook;
3250 lwkt_tokref vlock;
3252 lwkt_gettoken(&vlock, &vp->v_token);
3253 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3254 kn, knote, kn_selnext);
3255 lwkt_reltoken(&vlock);
3256 }
3258 static int
3259 filt_hammerread(struct knote *kn, long hint)
3260 {
3261 struct vnode *vp = (void *)kn->kn_hook;
3262 hammer_inode_t ip = VTOI(vp);
3264 if (hint == NOTE_REVOKE) {
3265 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3266 return(1);
3267 }
3268 kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
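/*
 * Example (annotation, not part of the original source): with a
 * 1000-byte file and f_offset 400, kn_data becomes 600 and the
 * knote reports readable; at EOF (f_offset 1000) kn_data is 0 and
 * the filter returns false.
 */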
3269 return (kn->kn_data != 0);
3270 }
3272 static int
3273 filt_hammerwrite(struct knote *kn, long hint)
3274 {
3275 if (hint == NOTE_REVOKE)
3276 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3277 kn->kn_data = 0;
3278 return (1);
3279 }
3281 static int
3282 filt_hammervnode(struct knote *kn, long hint)
3283 {
3284 if (kn->kn_sfflags & hint)
3285 kn->kn_fflags |= hint;
3286 if (hint == NOTE_REVOKE) {
3287 kn->kn_flags |= EV_EOF;
3288 return (1);
3289 }
3290 return (kn->kn_fflags != 0);
3291 }