kernel/os/vm_subr.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24
  25 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  26 /*        All Rights Reserved   */
  27
  28 /*
  29  * University Copyright- Copyright (c) 1982, 1986, 1988
  30  * The Regents of the University of California
  31  * All Rights Reserved
  32  *
  33  * University Acknowledgment- Portions of this document are derived from
  34  * software developed by the University of California, Berkeley, and its
  35  * contributors.
  36  */
  37
  38 #include <sys/types.h>
  39 #include <sys/t_lock.h>
  40 #include <sys/param.h>
  41 #include <sys/errno.h>
  42 #include <sys/debug.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/kmem.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/inline.h>
  47 #include <sys/buf.h>
  48 #include <sys/uio.h>
  49 #include <sys/user.h>
  50 #include <sys/proc.h>
  51 #include <sys/systm.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/cpuvar.h>
  54 #include <sys/mman.h>
  55 #include <sys/cred.h>
  56 #include <sys/vnode.h>
  57 #include <sys/file.h>
  58 #include <sys/vm.h>
  59
  60 #include <sys/swap.h>
  61 #include <sys/vtrace.h>
  62 #include <sys/tnf_probe.h>
  63 #include <sys/fs/snode.h>
  64 #include <sys/copyops.h>
  65 #include <sys/conf.h>
  66 #include <sys/sdt.h>
  67
  68 #include <vm/anon.h>
  69 #include <vm/hat.h>
  70 #include <vm/as.h>
  71 #include <vm/seg.h>
  72 #include <vm/page.h>
  73 #include <vm/seg_vn.h>
  74 #include <vm/seg_kmem.h>
  75
  76 extern int maxphys;
  77
  78 void
  79 minphys(struct buf *bp)
  80 {
  81         if (bp->b_bcount > maxphys)
  82                 bp->b_bcount = maxphys;
  83 }
  84
  85 /*
  86  * use kmem_cache_create for physio buffers. This has shown
  87  * a better cache distribution compared to buffers on the
  88  * stack. It also avoids semaphore construction/deconstruction
  89  * per request
  90  */
  91
  92 static struct kmem_cache *physio_buf_cache;
  93
  94 /* ARGSUSED */
  95 static int
  96 physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
  97 {
  98         bioinit((struct buf *)buf);
  99         return (0);
 100 }
 101
 102 /* ARGSUSED */
 103 static void
 104 physio_buf_destructor(void *buf, void *cdrarg)
 105 {
 106         biofini((struct buf *)buf);
 107 }
 108
 109 void
 110 physio_bufs_init(void)
 111 {
 112         physio_buf_cache = kmem_cache_create("physio_buf_cache",
 113             sizeof (struct buf), 0, physio_buf_constructor,
 114             physio_buf_destructor, NULL, NULL, NULL, 0);
 115 }
 116
 117
 118
 119 /*
 120  * initiate raw I/O request
 121  *
 122  * allocate buf header if necessary
 123  * adjust max size of each I/O request
 124  * lock down user pages and verify access protections
 125  * call driver's strategy routine to submit request
 126  * wait for I/O completion
 127  * unlock user pages and free allocated buf header
 128  */
 129
 130 int
 131 default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
 132         int rw, void (*mincnt)(struct buf *), struct uio *uio)
 133 {
 134         struct iovec *iov;
 135         struct proc *procp;
 136         struct as *asp;
 137         ssize_t c;
 138         char *a;
 139         int error = 0;
 140         page_t **pplist;
 141         int allocbuf = 0;
 142
 143         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
 144
 145         if (rw == B_READ) {
 146                 CPU_STATS_ADD_K(sys, phread, 1);
 147         } else {
 148                 CPU_STATS_ADD_K(sys, phwrite, 1);
 149         }
 150
 151         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
 152             "getbuf_start: bp %p", bp);
 153
 154         if (bp == NULL) {
 155                 bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
 156                 bp->b_iodone = NULL;
 157                 bp->b_resid = 0;
 158                 allocbuf = 1;
 159         }
 160         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
 161
 162         if (uio->uio_segflg == UIO_USERSPACE) {
 163                 procp = ttoproc(curthread);
 164                 asp = procp->p_as;
 165         } else {
 166                 procp = NULL;
 167                 asp = &kas;
 168         }
 169         ASSERT(SEMA_HELD(&bp->b_sem));
 170
 171         /*
 172          * We need to prepare this buffer for the io:::start probe, including
 173          * NULL'ing out the file, clearing the offset, and filling in the
 174          * b_dip field.
 175          */
 176         bp->b_file = NULL;
 177         bp->b_offset = -1;
 178
 179         if (dev != NODEV) {
 180                 (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
 181                     DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
 182         } else {
 183                 bp->b_dip = NULL;
 184         }
 185
 186         while (uio->uio_iovcnt > 0) {
 187                 iov = uio->uio_iov;
 188
 189                 bp->b_error = 0;
 190                 bp->b_proc = procp;
 191
 192                 while (iov->iov_len > 0) {
 193                         if (uio->uio_resid == 0)
 194                                 break;
 195                         if (uio->uio_loffset < 0) {
 196                                 error = EINVAL;
 197                                 break;
 198                         }
 199 #ifdef  _ILP32
 200                         /*
 201                          * For 32-bit kernels, check against SPEC_MAXOFFSET_T
 202                          * which represents the maximum size that can be
 203                          * supported by the IO subsystem.
 204                          * XXX this code assumes a D_64BIT driver.
 205                          */
 206                         if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
 207                                 error = EINVAL;
 208                                 break;
 209                         }
 210 #endif  /* _ILP32 */
 211                         bp->b_flags = B_BUSY | B_PHYS | rw;
 212                         bp->b_edev = dev;
 213                         bp->b_lblkno = btodt(uio->uio_loffset);
 214
 215                         /*
 216                          * Don't count on b_addr remaining untouched by the
 217                          * code below (it may be reset because someone does
 218                          * a bp_mapin on the buffer) -- reset from the iov
 219                          * each time through, updating the iov's base address
 220                          * instead.
 221                          */
 222                         a = bp->b_un.b_addr = iov->iov_base;
 223                         bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
 224                         (*mincnt)(bp);
 225                         c = bp->b_bcount;
 226
 227                         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
 228                             "as_pagelock_start: bp %p", bp);
 229
 230                         error = as_pagelock(asp, &pplist, a,
 231                             c, rw == B_READ? S_WRITE : S_READ);
 232
 233                         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
 234                             "as_pagelock_end:");
 235
 236                         if (error != 0) {
 237                                 bp->b_flags |= B_ERROR;
 238                                 bp->b_error = error;
 239                                 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
 240                                 break;
 241                         }
 242                         bp->b_shadow = pplist;
 243                         if (pplist != NULL) {
 244                                 bp->b_flags |= B_SHADOW;
 245                         }
 246
 247                         DTRACE_IO1(start, struct buf *, bp);
 248                         bp->b_flags |= B_STARTED;
 249
 250                         (void) (*strat)(bp);
 251                         error = biowait(bp);
 252
 253                         /*
 254                          * unlock the pages
 255                          */
 256                         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
 257                             "as_pageunlock_start: bp %p", bp);
 258
 259                         as_pageunlock(asp, pplist, a, c,
 260                             rw == B_READ? S_WRITE : S_READ);
 261
 262                         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
 263                             "as_pageunlock_end:");
 264
 265                         c -= bp->b_resid;
 266                         iov->iov_base += c;
 267                         iov->iov_len -= c;
 268                         uio->uio_resid -= c;
 269                         uio->uio_loffset += c;
 270                         /* bp->b_resid - temp kludge for tape drives */
 271                         if (bp->b_resid || error)
 272                                 break;
 273                 }
 274                 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
 275                 /* bp->b_resid - temp kludge for tape drives */
 276                 if (bp->b_resid || error)
 277                         break;
 278                 uio->uio_iov++;
 279                 uio->uio_iovcnt--;
 280         }
 281
 282         if (allocbuf) {
 283                 kmem_cache_free(physio_buf_cache, bp);
 284         }
 285
 286         TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
 287
 288         return (error);
 289 }
 290
 291 /*
 292  * Returns 0 on success, or an error on failure.
 293  *
 294  * This function is no longer a part of the DDI/DKI.
 295  * However, for compatibility, its interface should not
 296  * be changed and it should not be removed from the kernel.
 297  */
 298 int
 299 useracc(void *addr, size_t count, int access)
 300 {
 301         uint_t prot;
 302
 303         prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
 304         return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
 305 }
 306
 307 #define MAX_MAPIN_PAGES 8
 308
 309 /*
 310  * This function temporarily "borrows" user pages for kernel use. If
 311  * "cow" is on, it also sets up copy-on-write protection (only feasible
 312  * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 313  * pages from any changes by the user. The caller is responsible for
 314  * unlocking and tearing down cow settings when it's done with the pages.
 315  * For an example, see kcfree().
 316  *
 317  * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 318  * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 319  * kaddr != -1. On entering this function, cached_ppp contains a list
 320  * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 321  * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp],
 322  * the kernel map won't need to be reloaded again.
 323  *
 324  * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 325  * reference count, and change the user-mapping to read-only. This
 326  * scheme should work on all types of segment drivers. But to be safe,
 327  * we check against segvn here.
 328  *
 329  * Since this function is used to emulate copyin() semantic, it checks
 330  * to make sure the user-mappings allow "user-read".
 331  *
 332  * On exit "lenp" contains the number of bytes successfully locked and
 333  * mapped in. For the unsuccessful ones, the caller can fall back to
 334  * copyin().
 335  *
 336  * Error return:
 337  * ENOTSUP - operation like this is not supported either on this segment
 338  * type, or on this platform type.
 339  */
 340 int
 341 cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
 342     struct anon **app, size_t *lenp, int cow)
 343 {
 344         struct          hat *hat;
 345         struct seg      *seg;
 346         caddr_t         base;
 347         page_t          *pp, *ppp[MAX_MAPIN_PAGES];
 348         long            i;
 349         int             flags;
 350         size_t          size, total = *lenp;
 351         char            first = 1;
 352         faultcode_t     res;
 353
 354         *lenp = 0;
 355         if (cow) {
 356                 AS_LOCK_ENTER(as, RW_WRITER);
 357                 seg = as_findseg(as, uaddr, 0);
 358                 if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
 359                     (uaddr + total) > base + seg->s_size) {
 360                         AS_LOCK_EXIT(as);
 361                         return (EINVAL);
 362                 }
 363                 /*
 364                  * The COW scheme should work for all segment types.
 365                  * But to be safe, we check against segvn.
 366                  */
 367                 if (seg->s_ops != &segvn_ops) {
 368                         AS_LOCK_EXIT(as);
 369                         return (ENOTSUP);
 370                 } else if ((segop_gettype(seg, uaddr) & MAP_PRIVATE) == 0) {
 371                         AS_LOCK_EXIT(as);
 372                         return (ENOTSUP);
 373                 }
 374         }
 375         hat = as->a_hat;
 376         size = total;
 377 tryagain:
 378         /*
 379          * If (cow), hat_softlock will also change the usr protection to RO.
 380          * This is the first step toward setting up cow. Before we
 381          * bump up an_refcnt, we can't allow any cow-fault on this
 382          * address. Otherwise segvn_fault will change the protection back
 383          * to RW upon seeing an_refcnt == 1.
 384          * The solution is to hold the writer lock on "as".
 385          */
 386         res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
 387         size = total - size;
 388         *lenp += size;
 389         size = size >> PAGESHIFT;
 390         i = 0;
 391         while (i < size) {
 392                 pp = ppp[i];
 393                 if (cow) {
 394                         kmutex_t *ahm;
 395                         /*
 396                          * Another solution is to hold SE_EXCL on pp, and
 397                          * disable PROT_WRITE. This also works for MAP_SHARED
 398                          * segment. The disadvantage is that it locks the
 399                          * page from being used by anybody else.
 400                          */
 401                         ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
 402                         mutex_enter(ahm);
 403                         *app = swap_anon(pp->p_vnode, pp->p_offset);
 404                         /*
 405                          * Since we are holding the as lock, this avoids a
 406                          * potential race with anon_decref. (segvn_unmap and
 407                          * segvn_free needs the as writer lock to do anon_free.)
 408                          */
 409                         if (*app != NULL) {
 410 #if 0
 411                                 if ((*app)->an_refcnt == 0)
 412                                 /*
 413                                  * Consider the following senario (unlikey
 414                                  * though):
 415                                  * 1. an_refcnt == 2
 416                                  * 2. we solftlock the page.
 417                                  * 3. cow ocurrs on this addr. So a new ap,
 418                                  * page and mapping is established on addr.
 419                                  * 4. an_refcnt drops to 1 (segvn_faultpage
 420                                  * -> anon_decref(oldap))
 421                                  * 5. the last ref to ap also drops (from
 422                                  * another as). It ends up blocked inside
 423                                  * anon_decref trying to get page's excl lock.
 424                                  * 6. Later kcfree unlocks the page, call
 425                                  * anon_decref -> oops, ap is gone already.
 426                                  *
 427                                  * Holding as writer lock solves all problems.
 428                                  */
 429                                         *app = NULL;
 430                                 else
 431 #endif
 432                                         (*app)->an_refcnt++;
 433                         }
 434                         mutex_exit(ahm);
 435                 } else {
 436                         *app = NULL;
 437                 }
 438                 if (kaddr != (caddr_t)-1) {
 439                         if (pp != *cached_ppp) {
 440                                 if (*cached_ppp == NULL)
 441                                         flags = HAT_LOAD_LOCK | HAT_NOSYNC |
 442                                             HAT_LOAD_NOCONSIST;
 443                                 else
 444                                         flags = HAT_LOAD_REMAP |
 445                                             HAT_LOAD_NOCONSIST;
 446                                 /*
 447                                  * In order to cache the kernel mapping after
 448                                  * the user page is unlocked, we call
 449                                  * hat_devload instead of hat_memload so
 450                                  * that the kernel mapping we set up here is
 451                                  * "invisible" to the rest of the world. This
 452                                  * is not very pretty. But as long as the
 453                                  * caller bears the responsibility of keeping
 454                                  * cache consistency, we should be ok -
 455                                  * HAT_NOCONSIST will get us a uncached
 456                                  * mapping on VAC. hat_softlock will flush
 457                                  * a VAC_WRITEBACK cache. Therefore the kaddr
 458                                  * doesn't have to be of the same vcolor as
 459                                  * uaddr.
 460                                  * The alternative is - change hat_devload
 461                                  * to get a cached mapping. Allocate a kaddr
 462                                  * with the same vcolor as uaddr. Then
 463                                  * hat_softlock won't need to flush the VAC.
 464                                  */
 465                                 hat_devload(kas.a_hat, kaddr, PAGESIZE,
 466                                     page_pptonum(pp), PROT_READ, flags);
 467                                 *cached_ppp = pp;
 468                         }
 469                         kaddr += PAGESIZE;
 470                 }
 471                 cached_ppp++;
 472                 app++;
 473                 ++i;
 474         }
 475         if (cow) {
 476                 AS_LOCK_EXIT(as);
 477         }
 478         if (first && res == FC_NOMAP) {
 479                 /*
 480                  * If the address is not mapped yet, we call as_fault to
 481                  * fault the pages in. We could've fallen back to copy and
 482                  * let it fault in the pages. But for a mapped file, we
 483                  * normally reference each page only once. For zero-copy to
 484                  * be of any use, we'd better fall in the page now and try
 485                  * again.
 486                  */
 487                 first = 0;
 488                 size = size << PAGESHIFT;
 489                 uaddr += size;
 490                 total -= size;
 491                 size = total;
 492                 res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
 493                 if (cow)
 494                         AS_LOCK_ENTER(as, RW_WRITER);
 495                 goto tryagain;
 496         }
 497         switch (res) {
 498         case FC_NOSUPPORT:
 499                 return (ENOTSUP);
 500         case FC_PROT:   /* Pretend we don't know about it. This will be */
 501                         /* caught by the caller when uiomove fails. */
 502         case FC_NOMAP:
 503         case FC_OBJERR:
 504         default:
 505                 return (0);
 506         }
 507 }