2 * Copyright (c) 2000-2003 Tor Egge
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * $FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.3.2.2 2003/05/29 06:15:35 alc Exp $
27 * $DragonFly: src/sys/vfs/ufs/ffs_rawread.c,v 1.28 2008/06/19 23:27:39 dillon Exp $
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/fcntl.h>
36 #include <sys/mount.h>
37 #include <sys/namei.h>
38 #include <sys/vnode.h>
40 #include <sys/filio.h>
41 #include <sys/ttycom.h>
47 #include <machine/limits.h>
49 #include <vm/vm_extern.h>
50 #include <vm/vm_object.h>
51 #include <sys/kernel.h>
52 #include <sys/sysctl.h>
/*
 * Forward declarations.  The three static helpers are private to this
 * file; ffs_rawread() and ffs_rawread_setup() are declared without
 * static and so have external linkage.
 * NOTE(review): the ffs_rawread_main() prototype is cut short in this
 * listing; its definition further down takes (struct vnode *vp,
 * struct uio *uio).
 */
54 static int ffs_rawread_readahead(struct vnode
*vp
, caddr_t udata
, off_t offset
,
55 size_t len
, struct buf
*bp
);
56 static int ffs_rawread_main(struct vnode
*vp
,
59 static int ffs_rawread_sync(struct vnode
*vp
);
61 int ffs_rawread(struct vnode
*vp
, struct uio
*uio
, int *workdone
);
63 void ffs_rawread_setup(void);
/*
 * Tunables for the raw read path, all registered under the _vfs_ffs
 * sysctl parent declared here.
 */
65 SYSCTL_DECL(_vfs_ffs
);
/*
 * Counter passed by address to getpbuf_kva(), trypbuf_kva() and relpbuf()
 * in ffs_rawread_main().  Exported read-only (CTLFLAG_RD).  The initial
 * value of 4 is replaced when ffs_rawread_setup() runs.
 */
67 static int ffsrawbufcnt
= 4;
68 SYSCTL_INT(_vfs_ffs
, OID_AUTO
, ffsrawbufcnt
, CTLFLAG_RD
, &ffsrawbufcnt
, 0,
69 "Buffers available for raw reads");
/*
 * Read-write switch, on by default.  ffs_rawread() only considers the
 * raw path when this is non-zero.
 */
71 static int allowrawread
= 1;
72 SYSCTL_INT(_vfs_ffs
, OID_AUTO
, allowrawread
, CTLFLAG_RW
, &allowrawread
, 0,
73 "Flag to enable raw reads");
/*
 * Read-write switch, on by default.  ffs_rawread_main() only tries to
 * obtain a second (readahead) pbuf when this is non-zero.
 */
75 static int rawreadahead
= 1;
76 SYSCTL_INT(_vfs_ffs
, OID_AUTO
, rawreadahead
, CTLFLAG_RW
, &rawreadahead
, 0,
77 "Flag to enable readahead for long raw reads");
/*
 * ffs_rawread_setup: derive ffsrawbufcnt from nswbuf_kva.
 *
 * When nswbuf_kva is greater than 100 the count is nswbuf_kva minus
 * one sixteenth of it (nswbuf_kva >> 4); otherwise it is nswbuf_kva - 8.
 * Takes no arguments and is declared void in the prototype above.
 *
 * NOTE(review): the else-branch yields zero or a negative count when
 * nswbuf_kva is 8 or less -- confirm nswbuf_kva has a sufficient floor.
 */
81 ffs_rawread_setup(void)
83 ffsrawbufcnt
= (nswbuf_kva
> 100 ) ?
84 (nswbuf_kva
- (nswbuf_kva
>> 4)) : nswbuf_kva
- 8;
/*
 * ffs_rawread_sync: flush everything pending on a vnode before a raw read.
 *
 * vp is the vnode about to be read.  The work is done with vp->v_token
 * held.  If the vnode has tracked writes in flight, a non-empty dirty
 * buffer tree, or VOBJDIRTY set, the function cleans the VM object,
 * waits for tracked writes and fsyncs the vnode.  It panics with
 * "ffs_rawread_sync: dirty bufs" if writes or dirty buffers are still
 * present after the VOP_FSYNC call.
 *
 * Declared static int; ffs_rawread() stores the result in its error
 * variable.
 * NOTE(review): the declaration of error, the handling of failures from
 * bio_track_wait()/VOP_FSYNC() and the return statement are not visible
 * in this listing -- presumably the first failure is returned after the
 * token is released; confirm against the full source.
 */
89 ffs_rawread_sync(struct vnode
*vp
)
94 * Check for dirty mmap, pending writes and dirty buffers
96 lwkt_gettoken(&vp
->v_token
);
/* Fast path: nothing to do unless one of the three conditions holds. */
97 if (bio_track_active(&vp
->v_track_write
) ||
98 !RB_EMPTY(&vp
->v_rbdirty_tree
) ||
99 (vp
->v_flag
& VOBJDIRTY
) != 0) {
100 /* Attempt to msync mmap() regions to clean dirty mmap */
101 if ((vp
->v_flag
& VOBJDIRTY
) != 0) {
102 struct vm_object
*obj
;
/* Synchronous clean of the backing VM object, when there is one. */
103 if ((obj
= vp
->v_object
) != NULL
)
104 vm_object_page_clean(obj
, 0, 0, OBJPC_SYNC
);
107 /* Wait for pending writes to complete */
108 error
= bio_track_wait(&vp
->v_track_write
, 0, 0);
112 /* Flush dirty buffers */
113 if (!RB_EMPTY(&vp
->v_rbdirty_tree
)) {
114 if ((error
= VOP_FSYNC(vp
, MNT_WAIT
, 0)) != 0) {
/* Anything still in flight or dirty at this point is treated as fatal. */
117 if (bio_track_active(&vp
->v_track_write
) ||
118 !RB_EMPTY(&vp
->v_rbdirty_tree
))
119 panic("ffs_rawread_sync: dirty bufs");
125 lwkt_reltoken(&vp
->v_token
);
/*
 * ffs_rawread_readahead: set up one raw read into a caller-supplied buf.
 *
 * vp      - vnode of the file being read
 * udata   - destination address (the caller takes it from the uio's
 *           iov_base); it is mapped into bp with vmapbuf()
 * loffset - logical byte offset in the file
 * len     - number of bytes wanted; clamped below to what one
 *           translation covers
 * bp      - buf to use for the transfer
 *
 * The logical offset is translated with VOP_BMAP() into bio2.  If no
 * backing block exists (bio2 offset stays NOOFFSET) the mapped buffer
 * is zero-filled; otherwise a BUF_CMD_READ is issued against the device
 * vnode via vn_strategy().  bio1 is marked BIO_SYNC with biodone_sync
 * as its done routine, and the caller waits on it with biowait().
 *
 * Declared static int.
 * NOTE(review): local declarations, every error return and the code
 * that completes the hole case are not visible in this listing.
 */
131 ffs_rawread_readahead(struct vnode
*vp
, caddr_t udata
, off_t loffset
,
132 size_t len
, struct buf
*bp
)
/* Filesystem I/O size as reported by the mount. */
141 bsize
= vp
->v_mount
->mnt_stat
.f_iosize
;
144 * Make sure it fits into the pbuf
/* iolen is the offset of udata within its page. */
146 iolen
= (int)(intptr_t)udata
& PAGE_MASK
;
/* NOTE(review): body of this check is not visible -- presumably trims len. */
147 if (len
+ iolen
> bp
->b_kvasize
) {
154 * Raw disk address is in bio2, but we wait for it to
/* Reset per-request state before the translation. */
157 bp
->b_flags
&= ~B_ERROR
;
158 bp
->b_loffset
= loffset
;
159 bp
->b_bio2
.bio_offset
= NOOFFSET
;
160 bp
->b_bio1
.bio_done
= biodone_sync
;
161 bp
->b_bio1
.bio_flags
|= BIO_SYNC
;
/* Offset of the request inside its filesystem block, in DEV_BSIZE units. */
163 blockoff
= (loffset
% bsize
) / DEV_BSIZE
;
/* Translate the logical offset; bforwards is filled in by VOP_BMAP(). */
165 error
= VOP_BMAP(vp
, bp
->b_loffset
, &bp
->b_bio2
.bio_offset
,
166 &bforwards
, NULL
, BUF_CMD_READ
);
169 dp
= VTOI(vp
)->i_devvp
;
/* bio2 offset still NOOFFSET after the translation: treat as a hole. */
170 if (bp
->b_bio2
.bio_offset
== NOOFFSET
) {
172 * Fill holes with NULs to preserve semantics
/* Do not run past the end of the current filesystem block. */
174 if (len
+ blockoff
* DEV_BSIZE
> bsize
)
175 len
= bsize
- blockoff
* DEV_BSIZE
;
177 if (vmapbuf(bp
, udata
, len
) < 0)
181 bzero(bp
->b_data
, bp
->b_bcount
);
183 /* Mark operation completed (similar to bufdone()) */
/* Mapped case: limit len to the bforwards value from VOP_BMAP(). */
189 if (len
+ blockoff
* DEV_BSIZE
> bforwards
)
190 len
= bforwards
- blockoff
* DEV_BSIZE
;
/* Advance the device offset to the requested position within the block. */
191 bp
->b_bio2
.bio_offset
+= blockoff
* DEV_BSIZE
;
193 if (vmapbuf(bp
, udata
, len
) < 0)
197 * Access the block device layer using the device vnode (dp) and
198 * the translated block number (bio2) instead of the logical block
201 * Even though we are bypassing the vnode layer, we still
202 * want the vnode state to indicate that an I/O on its behalf
205 bp
->b_cmd
= BUF_CMD_READ
;
/* Account the read on the file vnode, then issue it on the device vnode. */
206 bio_start_transaction(&bp
->b_bio1
, &vp
->v_track_read
);
207 vn_strategy(dp
, &bp
->b_bio2
);
/*
 * ffs_rawread_main: carry out a raw read described by a uio.
 *
 * vp  - vnode to read from
 * uio - request; iov_base, uio_resid and uio_offset are copied into the
 *       locals udata, resid and offset on entry and written back from
 *       those locals at the end
 *
 * Reads are started with ffs_rawread_readahead() on pbufs obtained with
 * the ffsrawbufcnt counter and waited for with biowait().  bp is the
 * buffer currently being waited on.  nbp is an optional second buffer,
 * requested with trypbuf_kva() only when rawreadahead is non-zero and
 * resid exceeds bp->b_bufsize.  Both are handed back with relpbuf().
 *
 * NOTE(review): the enclosing loop, the code that advances udata, offset
 * and resid, some call arguments and the return statement are not
 * visible in this listing; tbp is presumably a temporary for swapping
 * bp and nbp -- confirm against the full source.
 */
212 ffs_rawread_main(struct vnode
*vp
, struct uio
*uio
)
215 struct buf
*bp
, *nbp
, *tbp
;
/* Work on local copies; the uio is updated once at the end. */
221 udata
= uio
->uio_iov
->iov_base
;
222 resid
= uio
->uio_resid
;
223 offset
= uio
->uio_offset
;
233 if (bp
== NULL
) { /* Setup first read */
234 /* XXX: Leave some bufs for swap */
235 bp
= getpbuf_kva(&ffsrawbufcnt
);
236 error
= ffs_rawread_readahead(vp
, udata
, offset
,
241 if (resid
> bp
->b_bufsize
) { /* Setup first readahead */
242 /* XXX: Leave bufs for swap */
243 if (rawreadahead
!= 0)
244 nbp
= trypbuf_kva(&ffsrawbufcnt
);
/* The readahead starts one buffer (b_bufsize) past the current read. */
248 nerror
= ffs_rawread_readahead(
250 udata
+ bp
->b_bufsize
,
251 offset
+ bp
->b_bufsize
,
252 resid
- bp
->b_bufsize
,
255 relpbuf(nbp
, &ffsrawbufcnt
);
/* Wait for the current read to finish. */
262 biowait(&bp
->b_bio1
, "rawrd");
/* Bytes actually transferred by this read. */
266 iolen
= bp
->b_bcount
- bp
->b_resid
;
267 if (iolen
== 0 && (bp
->b_flags
& B_ERROR
) == 0) {
268 nerror
= 0; /* Ignore possible beyond EOF error */
272 if ((bp
->b_flags
& B_ERROR
) != 0) {
276 clearbiocache(&bp
->b_bio2
);
280 if (iolen
< bp
->b_bufsize
) {
281 /* Incomplete read. Try to read remaining part */
282 error
= ffs_rawread_readahead(
284 bp
->b_bufsize
- iolen
, bp
);
287 } else if (nbp
!= NULL
) { /* Complete read with readahead */
293 clearbiocache(&nbp
->b_bio2
);
295 if (resid
<= bp
->b_bufsize
) { /* No more readaheads */
296 relpbuf(nbp
, &ffsrawbufcnt
);
298 } else { /* Setup next readahead */
299 nerror
= ffs_rawread_readahead(
300 vp
, udata
+ bp
->b_bufsize
,
301 offset
+ bp
->b_bufsize
,
302 resid
- bp
->b_bufsize
,
305 relpbuf(nbp
, &ffsrawbufcnt
);
309 } else if (nerror
!= 0) {/* Deferred Readahead error */
311 } else if (resid
> 0) { /* More to read, no readahead */
312 error
= ffs_rawread_readahead(vp
, udata
, offset
,
/* Teardown: hand back the primary buffer, then drain any readahead. */
320 relpbuf(bp
, &ffsrawbufcnt
);
321 if (nbp
!= NULL
) { /* Run down readahead buffer */
322 biowait(&nbp
->b_bio1
, "rawrd");
324 relpbuf(nbp
, &ffsrawbufcnt
);
/* Publish progress back to the caller's uio. */
329 uio
->uio_iov
->iov_base
= udata
;
330 uio
->uio_resid
= resid
;
331 uio
->uio_offset
= offset
;
337 ffs_rawread(struct vnode
*vp
,
341 if (allowrawread
!= 0 &&
342 uio
->uio_iovcnt
== 1 &&
343 uio
->uio_segflg
== UIO_USERSPACE
&&
344 uio
->uio_resid
== uio
->uio_iov
->iov_len
&&
345 (curthread
->td_flags
& TDF_DEADLKTREAT
) == 0) {
346 int secsize
; /* Media sector size */
347 off_t filebytes
; /* Bytes left of file */
348 int blockbytes
; /* Bytes left of file in full blocks */
349 int partialbytes
; /* Bytes in last partial block */
350 int skipbytes
; /* Bytes not to read in ffs_rawread */
355 /* Only handle sector aligned reads */
357 secsize
= ip
->i_devvp
->v_rdev
->si_bsize_phys
;
358 if ((uio
->uio_offset
& (secsize
- 1)) == 0 &&
359 (uio
->uio_resid
& (secsize
- 1)) == 0) {
361 /* Sync dirty pages and buffers if needed */
362 error
= ffs_rawread_sync(vp
);
366 /* Check for end of file */
367 if (ip
->i_size
> uio
->uio_offset
) {
368 filebytes
= ip
->i_size
- uio
->uio_offset
;
370 /* No special eof handling needed ? */
371 if (uio
->uio_resid
<= filebytes
) {
373 return ffs_rawread_main(vp
, uio
);
376 partialbytes
= ((unsigned int) ip
->i_size
) %
378 blockbytes
= (int) filebytes
- partialbytes
;
379 if (blockbytes
> 0) {
380 skipbytes
= uio
->uio_resid
-
382 uio
->uio_resid
= blockbytes
;
383 error
= ffs_rawread_main(vp
, uio
);
384 uio
->uio_resid
+= skipbytes
;
387 /* Read remaining part using buffer */