/*
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.3.2.2 2003/05/29 06:15:35 alc Exp $
 * $DragonFly: src/sys/vfs/ufs/ffs_rawread.c,v 1.26 2006/08/12 00:26:21 dillon Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/buf.h>
#include "quota.h"
#include "inode.h"
#include "fs.h"

#include <machine/limits.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata,
				 off_t offset, size_t len, struct buf *bp,
				 int *baseticks);
static int ffs_rawread_main(struct vnode *vp, struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

static void ffs_rawreadwakeup(struct bio *bio);
SYSCTL_DECL(_vfs_ffs);

static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");
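
/*
 * Size the pool of pbufs available for raw reads as a fraction of
 * nswbuf, leaving headroom so that raw I/O cannot starve the swap
 * pager of buffers.
 */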
void
ffs_rawread_setup(void)
{
	ffsrawbufcnt = (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}
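
/*
 * Bring the file into a state where reading directly from the device
 * is safe: clean dirty mmap()'d pages, wait out pending writes, and
 * flush dirty buffers, upgrading the vnode lock if that is needed.
 */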
static int
ffs_rawread_sync(struct vnode *vp)
{
	int error;
	int upgraded;

	/* Check for dirty mmap, pending writes and dirty buffers */
	if (vp->v_track_write.bk_active > 0 ||
	    !RB_EMPTY(&vp->v_rbdirty_tree) ||
	    (vp->v_flag & VOBJDIRTY) != 0) {
		if (vn_islocked(vp) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			vn_lock(vp, LK_UPGRADE);
		} else {
			upgraded = 0;
		}

		/* Attempt to msync mmap() regions to clean dirty mmap */
		if ((vp->v_flag & VOBJDIRTY) != 0) {
			struct vm_object *obj;

			if ((obj = vp->v_object) != NULL)
				vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
		}

		/* Wait for pending writes to complete */
		while (vp->v_track_write.bk_active) {
			vp->v_track_write.bk_waitflag = 1;
			error = tsleep(&vp->v_track_write, 0, "rawrdfls", 0);
			if (error != 0) {
				if (upgraded != 0)
					vn_lock(vp, LK_DOWNGRADE);
				return (error);
			}
		}

		/* Flush dirty buffers */
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			if ((error = VOP_FSYNC(vp, MNT_WAIT)) != 0) {
				if (upgraded != 0)
					vn_lock(vp, LK_DOWNGRADE);
				return (error);
			}
			if (vp->v_track_write.bk_active > 0 ||
			    !RB_EMPTY(&vp->v_rbdirty_tree))
				panic("ffs_rawread_sync: dirty bufs");
		}
		if (upgraded != 0)
			vn_lock(vp, LK_DOWNGRADE);
	}
	return 0;
}
static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset,
		      size_t len, struct buf *bp, int *baseticks)
{
	int error;
	int iolen;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * Make sure it fits into the pbuf
	 */
	iolen = (int)(intptr_t)udata & PAGE_MASK;
	if (len + iolen > bp->b_kvasize) {
		len = bp->b_kvasize;
		if (iolen != 0)
			len -= PAGE_SIZE;
	}
	bp->b_flags &= ~B_ERROR;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = NOOFFSET;
	bp->b_bio2.bio_done = ffs_rawreadwakeup;

	blockoff = (loffset % bsize) / DEV_BSIZE;

	error = VOP_BMAP(vp, bp->b_loffset, &dp, &bp->b_bio2.bio_offset,
			 &bforwards, NULL);
	if (error != 0)
		return error;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		/*
		 * Fill holes with NULs to preserve semantics
		 */
		if (len + blockoff * DEV_BSIZE > bsize)
			len = bsize - blockoff * DEV_BSIZE;

		if (vmapbuf(bp, udata, len) < 0)
			return EFAULT;

		if (ticks - *baseticks >= hogticks) {
			*baseticks = ticks;
			uio_yield();
		}
		bzero(bp->b_data, bp->b_bcount);

		/* Mark operation completed (similar to bufdone()) */
		bp->b_resid = 0;
		bp->b_cmd = BUF_CMD_DONE;
		return 0;
	}
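
	/*
	 * The block is backed by the device: clamp the transfer to the
	 * contiguous run reported by VOP_BMAP and bias the device offset
	 * by the sector offset within the filesystem block.
	 */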
	if (len + blockoff * DEV_BSIZE > bforwards)
		len = bforwards - blockoff * DEV_BSIZE;
	bp->b_bio2.bio_offset += blockoff * DEV_BSIZE;

	if (vmapbuf(bp, udata, len) < 0)
		return EFAULT;
	/*
	 * Access the block device layer using the device vnode (dp) and
	 * the translated block number (bio2) instead of the logical block
	 * number (bio1).
	 *
	 * Even though we are bypassing the vnode layer, we still
	 * want the vnode state to indicate that an I/O on its behalf
	 * is in progress.
	 */
	bp->b_cmd = BUF_CMD_READ;
	bio_start_transaction(&bp->b_bio1, &vp->v_track_read);
	vn_strategy(dp, &bp->b_bio2);
	return 0;
}
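
/*
 * Drive a raw read with one pbuf in flight and, when enabled, a
 * second pbuf reading ahead.  The two buffers are swapped as each
 * I/O completes, and the uio is updated only once the loop exits.
 */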
static int
ffs_rawread_main(struct vnode *vp, struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	int iolen;
	int baseticks = ticks;
	caddr_t udata;
	int resid;
	off_t offset;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	error = 0;
	nerror = 0;

	bp = NULL;
	nbp = NULL;

	while (resid > 0) {
		if (bp == NULL) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, bp, &baseticks);
			if (error != 0)
				break;

			if (resid > bp->b_bufsize) { /* Setup first readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = NULL;
				if (nbp != NULL) {
					nerror = ffs_rawread_readahead(
							vp,
							udata + bp->b_bufsize,
							offset + bp->b_bufsize,
							resid - bp->b_bufsize,
							nbp, &baseticks);
					if (nerror) {
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = NULL;
					}
				}
			}
		}
		crit_enter();
		while (bp->b_cmd != BUF_CMD_DONE)
			tsleep((caddr_t)&bp->b_bio2, 0, "rawrd", 0);
		crit_exit();
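
		/*
		 * b_bcount - b_resid is the number of bytes the device
		 * actually transferred; zero without B_ERROR means EOF.
		 */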
		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_flags & B_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break;		/* EOF */
		}

		if ((bp->b_flags & B_ERROR) != 0) {
			error = bp->b_error;
			break;
		}
		vunmapbuf(bp);
		clearbiocache(&bp->b_bio2);
		resid -= iolen;
		udata += iolen;
		offset += iolen;
		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part */
			error = ffs_rawread_readahead(
				    vp, udata, offset,
				    bp->b_bufsize - iolen, bp, &baseticks);
			if (error != 0)
				break;
		} else if (nbp != NULL) { /* Complete read with readahead */
			tbp = bp;
			bp = nbp;
			nbp = tbp;

			clearbiocache(&nbp->b_bio2);

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = NULL;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(
						vp, udata + bp->b_bufsize,
						offset + bp->b_bufsize,
						resid - bp->b_bufsize,
						nbp, &baseticks);
				if (nerror != 0) {
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = NULL;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
						      resid, bp, &baseticks);
			if (error != 0)
				break;
		}
	}
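
	/*
	 * Common exit path: unwind both buffers whether the loop ended
	 * at EOF, on error, or after a deferred readahead error.
	 */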
	if (bp != NULL)
		relpbuf(bp, &ffsrawbufcnt);
	if (nbp != NULL) {		/* Run down readahead buffer */
		crit_enter();
		while (nbp->b_cmd != BUF_CMD_DONE)
			tsleep(&nbp->b_bio2, 0, "rawrd", 0);
		crit_exit();
		vunmapbuf(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}
	if (error == 0)
		error = nerror;
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}
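
/*
 * Top-level entry point for raw reads.  Unless the request is a
 * simple, sector-aligned read from user space, *workdone is left
 * zero and the caller falls back to the normal buffered read path.
 */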
int
ffs_rawread(struct vnode *vp,
	    struct uio *uio,
	    int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (curthread->td_flags & TDF_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;
		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ip->i_devvp->v_rdev->si_bsize_phys;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {
			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}
				partialbytes = ((unsigned int) ip->i_size) %
					ip->i_fs->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
						blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}
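
/*
 * bio_done callback for the raw read bios: mark the pbuf completed
 * and wake up any thread sleeping on the bio in ffs_rawread_main().
 */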
static void
ffs_rawreadwakeup(struct bio *bio)
{
	bio->bio_buf->b_cmd = BUF_CMD_DONE;
	wakeup(bio);
}