usr/src/uts/common/os/vm_subr.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26 /*        All Rights Reserved   */
  27
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37
  38 #include <sys/types.h>
  39 #include <sys/t_lock.h>
  40 #include <sys/param.h>
  41 #include <sys/errno.h>
  42 #include <sys/debug.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/kmem.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/inline.h>
  47 #include <sys/buf.h>
  48 #include <sys/uio.h>
  49 #include <sys/user.h>
  50 #include <sys/proc.h>
  51 #include <sys/systm.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/cpuvar.h>
  54 #include <sys/mman.h>
  55 #include <sys/cred.h>
  56 #include <sys/vnode.h>
  57 #include <sys/file.h>
  58 #include <sys/vm.h>
  59
  60 #include <sys/swap.h>
  61 #include <sys/vtrace.h>
  62 #include <sys/tnf_probe.h>
  63 #include <sys/fs/snode.h>
  64 #include <sys/copyops.h>
  65 #include <sys/conf.h>
  66 #include <sys/sdt.h>
  67
  68 #include <vm/anon.h>
  69 #include <vm/hat.h>
  70 #include <vm/as.h>
  71 #include <vm/seg.h>
  72 #include <vm/page.h>
  73 #include <vm/seg_vn.h>
  74 #include <vm/seg_kmem.h>
  75
  76 extern int maxphys;
  77
  78 void
  79 minphys(struct buf *bp)
  80 {
  81         if (bp->b_bcount > maxphys)
  82                 bp->b_bcount = maxphys;
  83 }
  84
  85 /*
  86  * use kmem_cache_create for physio buffers. This has shown
  87  * a better cache distribution compared to buffers on the
  88  * stack. It also avoids semaphore construction/deconstruction
  89  * per request
  90  */
  91
  92 static struct kmem_cache *physio_buf_cache;
  93
  94 /* ARGSUSED */
  95 static int
  96 physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
  97 {
  98         bioinit((struct buf *)buf);
  99         return (0);
 100 }
 101
 102 /* ARGSUSED */
 103 static void
 104 physio_buf_destructor(void *buf, void *cdrarg)
 105 {
 106         biofini((struct buf *)buf);
 107 }
 108
 109 void
 110 physio_bufs_init(void)
 111 {
 112         physio_buf_cache = kmem_cache_create("physio_buf_cache",
 113             sizeof (struct buf), 0, physio_buf_constructor,
 114             physio_buf_destructor, NULL, NULL, NULL, 0);
 115 }
 116
 117
 118
 119 /*
 120  * initiate raw I/O request
 121  *
 122  * allocate buf header if necessary
 123  * adjust max size of each I/O request
 124  * lock down user pages and verify access protections
 125  * call driver's strategy routine to submit request
 126  * wait for I/O completion
 127  * unlock user pages and free allocated buf header
 128  */
 129
 130 int
 131 default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
 132         int rw, void (*mincnt)(struct buf *), struct uio *uio)
 133 {
 134         struct iovec *iov;
 135         struct proc *procp;
 136         struct as *asp;
 137         ssize_t c;
 138         char *a;
 139         int error = 0;
 140         page_t **pplist;
 141         int allocbuf = 0;
 142
 143         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
 144
 145         /* Kernel probe */
 146         TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
 147             tnf_device,         device,         dev,
 148             tnf_offset,         offset,         uio->uio_loffset,
 149             tnf_size,           size,           uio->uio_resid,
 150             tnf_bioflags,       rw,             rw);
 151
 152         if (rw == B_READ) {
 153                 CPU_STATS_ADD_K(sys, phread, 1);
 154         } else {
 155                 CPU_STATS_ADD_K(sys, phwrite, 1);
 156         }
 157
 158         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
 159             "getbuf_start: bp %p", bp);
 160
 161         if (bp == NULL) {
 162                 bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
 163                 bp->b_iodone = NULL;
 164                 bp->b_resid = 0;
 165                 allocbuf = 1;
 166         }
 167         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
 168
 169         if (uio->uio_segflg == UIO_USERSPACE) {
 170                 procp = ttoproc(curthread);
 171                 asp = procp->p_as;
 172         } else {
 173                 procp = NULL;
 174                 asp = &kas;
 175         }
 176         ASSERT(SEMA_HELD(&bp->b_sem));
 177
 178         /*
 179          * We need to prepare this buffer for the io:::start probe, including
 180          * NULL'ing out the file, clearing the offset, and filling in the
 181          * b_dip field.
 182          */
 183         bp->b_file = NULL;
 184         bp->b_offset = -1;
 185
 186         if (dev != NODEV) {
 187                 (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
 188                     DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
 189         } else {
 190                 bp->b_dip = NULL;
 191         }
 192
 193         while (uio->uio_iovcnt > 0) {
 194                 iov = uio->uio_iov;
 195
 196                 bp->b_error = 0;
 197                 bp->b_proc = procp;
 198
 199                 while (iov->iov_len > 0) {
 200                         if (uio->uio_resid == 0)
 201                                 break;
 202                         if (uio->uio_loffset < 0) {
 203                                 error = EINVAL;
 204                                 break;
 205                         }
 206 #ifdef  _ILP32
 207                         /*
 208                          * For 32-bit kernels, check against SPEC_MAXOFFSET_T
 209                          * which represents the maximum size that can be
 210                          * supported by the IO subsystem.
 211                          * XXX this code assumes a D_64BIT driver.
 212                          */
 213                         if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
 214                                 error = EINVAL;
 215                                 break;
 216                         }
 217 #endif  /* _ILP32 */
 218                         bp->b_flags = B_BUSY | B_PHYS | rw;
 219                         bp->b_edev = dev;
 220                         bp->b_lblkno = btodt(uio->uio_loffset);
 221
 222                         /*
 223                          * Don't count on b_addr remaining untouched by the
 224                          * code below (it may be reset because someone does
 225                          * a bp_mapin on the buffer) -- reset from the iov
 226                          * each time through, updating the iov's base address
 227                          * instead.
 228                          */
 229                         a = bp->b_un.b_addr = iov->iov_base;
 230                         bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
 231                         (*mincnt)(bp);
 232                         c = bp->b_bcount;
 233
 234                         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
 235                             "as_pagelock_start: bp %p", bp);
 236
 237                         error = as_pagelock(asp, &pplist, a,
 238                             c, rw == B_READ? S_WRITE : S_READ);
 239
 240                         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
 241                             "as_pagelock_end:");
 242
 243                         if (error != 0) {
 244                                 bp->b_flags |= B_ERROR;
 245                                 bp->b_error = error;
 246                                 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
 247                                 break;
 248                         }
 249                         bp->b_shadow = pplist;
 250                         if (pplist != NULL) {
 251                                 bp->b_flags |= B_SHADOW;
 252                         }
 253
 254                         DTRACE_IO1(start, struct buf *, bp);
 255                         bp->b_flags |= B_STARTED;
 256
 257                         (void) (*strat)(bp);
 258                         error = biowait(bp);
 259
 260                         /*
 261                          * unlock the pages
 262                          */
 263                         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
 264                             "as_pageunlock_start: bp %p", bp);
 265
 266                         as_pageunlock(asp, pplist, a, c,
 267                             rw == B_READ? S_WRITE : S_READ);
 268
 269                         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
 270                             "as_pageunlock_end:");
 271
 272                         c -= bp->b_resid;
 273                         iov->iov_base += c;
 274                         iov->iov_len -= c;
 275                         uio->uio_resid -= c;
 276                         uio->uio_loffset += c;
 277                         /* bp->b_resid - temp kludge for tape drives */
 278                         if (bp->b_resid || error)
 279                                 break;
 280                 }
 281                 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
 282                 /* bp->b_resid - temp kludge for tape drives */
 283                 if (bp->b_resid || error)
 284                         break;
 285                 uio->uio_iov++;
 286                 uio->uio_iovcnt--;
 287         }
 288
 289         if (allocbuf) {
 290                 kmem_cache_free(physio_buf_cache, bp);
 291         }
 292
 293         /* Kernel probe */
 294         TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
 295                 tnf_device,     device,         dev);
 296
 297         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
 298
 299         return (error);
 300 }
 301
 302 /*
 303  * Returns 0 on success, or an error on failure.
 304  *
 305  * This function is no longer a part of the DDI/DKI.
 306  * However, for compatibility, its interface should not
 307  * be changed and it should not be removed from the kernel.
 308  */
 309 int
 310 useracc(void *addr, size_t count, int access)
 311 {
 312         uint_t prot;
 313
 314         prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
 315         return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
 316 }
 317
 318 #define MAX_MAPIN_PAGES 8
 319
 320 /*
 321  * This function temporarily "borrows" user pages for kernel use. If
 322  * "cow" is on, it also sets up copy-on-write protection (only feasible
 323  * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 324  * pages from any changes by the user. The caller is responsible for
 325  * unlocking and tearing down cow settings when it's done with the pages.
 326  * For an example, see kcfree().
 327  *
 328  * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 329  * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 330  * kaddr != -1. On entering this function, cached_ppp contains a list
 331  * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 332  * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp],
 333  * the kernel map won't need to be reloaded again.
 334  *
 335  * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 336  * reference count, and change the user-mapping to read-only. This
 337  * scheme should work on all types of segment drivers. But to be safe,
 338  * we check against segvn here.
 339  *
 340  * Since this function is used to emulate copyin() semantic, it checks
 341  * to make sure the user-mappings allow "user-read".
 342  *
 343  * On exit "lenp" contains the number of bytes successfully locked and
 344  * mapped in. For the unsuccessful ones, the caller can fall back to
 345  * copyin().
 346  *
 347  * Error return:
 348  * ENOTSUP - operation like this is not supported either on this segment
 349  * type, or on this platform type.
 350  */
 351 int
 352 cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
 353     struct anon **app, size_t *lenp, int cow)
 354 {
 355         struct          hat *hat;
 356         struct seg      *seg;
 357         caddr_t         base;
 358         page_t          *pp, *ppp[MAX_MAPIN_PAGES];
 359         long            i;
 360         int             flags;
 361         size_t          size, total = *lenp;
 362         char            first = 1;
 363         faultcode_t     res;
 364
 365         *lenp = 0;
 366         if (cow) {
 367                 AS_LOCK_ENTER(as, RW_WRITER);
 368                 seg = as_findseg(as, uaddr, 0);
 369                 if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
 370                     (uaddr + total) > base + seg->s_size) {
 371                         AS_LOCK_EXIT(as);
 372                         return (EINVAL);
 373                 }
 374                 /*
 375                  * The COW scheme should work for all segment types.
 376                  * But to be safe, we check against segvn.
 377                  */
 378                 if (seg->s_ops != &segvn_ops) {
 379                         AS_LOCK_EXIT(as);
 380                         return (ENOTSUP);
 381                 } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
 382                         AS_LOCK_EXIT(as);
 383                         return (ENOTSUP);
 384                 }
 385         }
 386         hat = as->a_hat;
 387         size = total;
 388 tryagain:
 389         /*
 390          * If (cow), hat_softlock will also change the usr protection to RO.
 391          * This is the first step toward setting up cow. Before we
 392          * bump up an_refcnt, we can't allow any cow-fault on this
 393          * address. Otherwise segvn_fault will change the protection back
 394          * to RW upon seeing an_refcnt == 1.
 395          * The solution is to hold the writer lock on "as".
 396          */
 397         res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
 398         size = total - size;
 399         *lenp += size;
 400         size = size >> PAGESHIFT;
 401         i = 0;
 402         while (i < size) {
 403                 pp = ppp[i];
 404                 if (cow) {
 405                         kmutex_t *ahm;
 406                         /*
 407                          * Another solution is to hold SE_EXCL on pp, and
 408                          * disable PROT_WRITE. This also works for MAP_SHARED
 409                          * segment. The disadvantage is that it locks the
 410                          * page from being used by anybody else.
 411                          */
 412                         ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
 413                         mutex_enter(ahm);
 414                         *app = swap_anon(pp->p_vnode, pp->p_offset);
 415                         /*
 416                          * Since we are holding the as lock, this avoids a
 417                          * potential race with anon_decref. (segvn_unmap and
 418                          * segvn_free needs the as writer lock to do anon_free.)
 419                          */
 420                         if (*app != NULL) {
 421 #if 0
 422                                 if ((*app)->an_refcnt == 0)
 423                                 /*
 424                                  * Consider the following senario (unlikey
 425                                  * though):
 426                                  * 1. an_refcnt == 2
 427                                  * 2. we solftlock the page.
 428                                  * 3. cow ocurrs on this addr. So a new ap,
 429                                  * page and mapping is established on addr.
 430                                  * 4. an_refcnt drops to 1 (segvn_faultpage
 431                                  * -> anon_decref(oldap))
 432                                  * 5. the last ref to ap also drops (from
 433                                  * another as). It ends up blocked inside
 434                                  * anon_decref trying to get page's excl lock.
 435                                  * 6. Later kcfree unlocks the page, call
 436                                  * anon_decref -> oops, ap is gone already.
 437                                  *
 438                                  * Holding as writer lock solves all problems.
 439                                  */
 440                                         *app = NULL;
 441                                 else
 442 #endif
 443                                         (*app)->an_refcnt++;
 444                         }
 445                         mutex_exit(ahm);
 446                 } else {
 447                         *app = NULL;
 448                 }
 449                 if (kaddr != (caddr_t)-1) {
 450                         if (pp != *cached_ppp) {
 451                                 if (*cached_ppp == NULL)
 452                                         flags = HAT_LOAD_LOCK | HAT_NOSYNC |
 453                                             HAT_LOAD_NOCONSIST;
 454                                 else
 455                                         flags = HAT_LOAD_REMAP |
 456                                             HAT_LOAD_NOCONSIST;
 457                                 /*
 458                                  * In order to cache the kernel mapping after
 459                                  * the user page is unlocked, we call
 460                                  * hat_devload instead of hat_memload so
 461                                  * that the kernel mapping we set up here is
 462                                  * "invisible" to the rest of the world. This
 463                                  * is not very pretty. But as long as the
 464                                  * caller bears the responsibility of keeping
 465                                  * cache consistency, we should be ok -
 466                                  * HAT_NOCONSIST will get us a uncached
 467                                  * mapping on VAC. hat_softlock will flush
 468                                  * a VAC_WRITEBACK cache. Therefore the kaddr
 469                                  * doesn't have to be of the same vcolor as
 470                                  * uaddr.
 471                                  * The alternative is - change hat_devload
 472                                  * to get a cached mapping. Allocate a kaddr
 473                                  * with the same vcolor as uaddr. Then
 474                                  * hat_softlock won't need to flush the VAC.
 475                                  */
 476                                 hat_devload(kas.a_hat, kaddr, PAGESIZE,
 477                                     page_pptonum(pp), PROT_READ, flags);
 478                                 *cached_ppp = pp;
 479                         }
 480                         kaddr += PAGESIZE;
 481                 }
 482                 cached_ppp++;
 483                 app++;
 484                 ++i;
 485         }
 486         if (cow) {
 487                 AS_LOCK_EXIT(as);
 488         }
 489         if (first && res == FC_NOMAP) {
 490                 /*
 491                  * If the address is not mapped yet, we call as_fault to
 492                  * fault the pages in. We could've fallen back to copy and
 493                  * let it fault in the pages. But for a mapped file, we
 494                  * normally reference each page only once. For zero-copy to
 495                  * be of any use, we'd better fall in the page now and try
 496                  * again.
 497                  */
 498                 first = 0;
 499                 size = size << PAGESHIFT;
 500                 uaddr += size;
 501                 total -= size;
 502                 size = total;
 503                 res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
 504                 if (cow)
 505                         AS_LOCK_ENTER(as, RW_WRITER);
 506                 goto tryagain;
 507         }
 508         switch (res) {
 509         case FC_NOSUPPORT:
 510                 return (ENOTSUP);
 511         case FC_PROT:   /* Pretend we don't know about it. This will be */
 512                         /* caught by the caller when uiomove fails. */
 513         case FC_NOMAP:
 514         case FC_OBJERR:
 515         default:
 516                 return (0);
 517         }
 518 }