/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/vnode.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>

#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
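/*
 * Illustrative usage sketch (not part of the original source): a character
 * driver's read(9E) entry point typically hands raw I/O off to physio(9F),
 * passing minphys() as the mincnt routine so each request is clamped to
 * maxphys before the driver's strategy routine sees it.  The "xx" names
 * below are hypothetical.
 */
static int xxstrategy(struct buf *bp);	/* hypothetical strategy(9E) routine */

static int
xxread(dev_t dev, struct uio *uiop, cred_t *credp)
{
	/* physio() locks down the user pages and waits for the transfer */
	return (physio(xxstrategy, NULL, dev, B_READ, minphys, uiop));
}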
/*
 * use kmem_cache_create for physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/destruction.
 */
static struct kmem_cache *physio_buf_cache;
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}
void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0, physio_buf_constructor,
	    physio_buf_destructor, NULL, NULL, NULL, 0);
}
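/*
 * Illustrative sketch (not part of the original source): buf headers for
 * raw I/O come from and go back to physio_buf_cache through the usual
 * kmem_cache_alloc(9F)/kmem_cache_free(9F) pair, as default_physio() does
 * below; bioinit()/biofini() run only in the cache constructor and
 * destructor rather than once per request.
 *
 *	struct buf *bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
 *	... set up and issue the transfer ...
 *	kmem_cache_free(physio_buf_cache, bp);
 */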
/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
	if (rw == B_READ)
		CPU_STATS_ADD_K(sys, phread, 1);
	else
		CPU_STATS_ADD_K(sys, phwrite, 1);
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
	}

	ASSERT(SEMA_HELD(&bp->b_sem));
	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * device information.
	 */
	(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
	    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	while (uio->uio_iovcnt > 0) {
		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");
			bp->b_flags |= B_ERROR;
			bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);

			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
	}
	kmem_cache_free(physio_buf_cache, bp);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
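/*
 * Illustrative sketch (not part of this excerpt): on illumos-derived
 * kernels the DDI-visible physio(9F) entry point is, as best understood,
 * a thin wrapper that simply forwards its arguments to default_physio(),
 * roughly:
 *
 *	int
 *	physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
 *	    int rw, void (*mincnt)(struct buf *), struct uio *uio)
 *	{
 *		return (default_physio(strat, bp, dev, rw, mincnt, uio));
 *	}
 */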
/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}
#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down cow settings when it's done with the pages.
 * For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind [uaddr..uaddr+*lenp],
 * the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * ENOTSUP - an operation like this is not supported either on this segment
 * type, or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	size_t size, total = *lenp;

	AS_LOCK_ENTER(as, RW_WRITER);
	seg = as_findseg(as, uaddr, 0);
	if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
	    (uaddr + total) > base + seg->s_size) {
	/*
	 * The COW scheme should work for all segment types.
	 * But to be safe, we check against segvn.
	 */
	if (seg->s_ops != &segvn_ops) {
	} else if ((segop_gettype(seg, uaddr) & MAP_PRIVATE) == 0) {
	/*
	 * If (cow), hat_softlock will also change the user protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = size >> PAGESHIFT;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for a MAP_SHARED
			 * segment. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if ((*app)->an_refcnt == 0)
				/*
				 * Consider the following scenario (unlikely
				 * though):
				 * 2. we softlock the page.
				 * 3. cow occurs on this addr. So a new ap,
				 * page and mapping is established on addr.
				 * 4. an_refcnt drops to 1 (segvn_faultpage
				 * -> anon_decref(oldap))
				 * 5. the last ref to ap also drops (from
				 * another as). It ends up blocked inside
				 * anon_decref trying to get page's excl lock.
				 * 6. Later kcfree unlocks the page, calls
				 * anon_decref -> oops, ap is gone already.
				 *
				 * Holding the as writer lock solves all problems.
				 */
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		size = size << PAGESHIFT;

		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);

		AS_LOCK_ENTER(as, RW_WRITER);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */