Unleashed v1.4
[unleashed.git] / kernel / vm / vpm.c
blob 01cbeeac932102f5b9c5c4b42e551f0bf3278193
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
28 * VM - generic vnode page mapping interfaces.
30 * Mechanism to provide temporary mappings to vnode pages.
31 * The typical use would be to copy/access file data.
34 #include <sys/types.h>
35 #include <sys/t_lock.h>
36 #include <sys/param.h>
37 #include <sys/sysmacros.h>
38 #include <sys/buf.h>
39 #include <sys/systm.h>
40 #include <sys/vnode.h>
41 #include <sys/mman.h>
42 #include <sys/errno.h>
43 #include <sys/cred.h>
44 #include <sys/kmem.h>
45 #include <sys/vtrace.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/thread.h>
49 #include <sys/dumphdr.h>
50 #include <sys/bitmap.h>
51 #include <sys/lgrp.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/hat.h>
55 #include <vm/as.h>
56 #include <vm/seg.h>
57 #include <vm/seg_kpm.h>
58 #include <vm/seg_map.h>
59 #include <vm/page.h>
60 #include <vm/pvn.h>
61 #include <vm/rm.h>
62 #include <vm/vpm.h>
65 #ifdef SEGKPM_SUPPORT
67 * VPM can be disabled by setting vpm_enable = 0 in
68 * /etc/system.
71 int vpm_enable = 1;
73 #else
75 int vpm_enable = 0;
77 #endif
79 #ifdef SEGKPM_SUPPORT
82 int vpm_cache_enable = 1;
83 long vpm_cache_percent = 12;
84 long vpm_cache_size;
85 int vpm_nfreelist = 0;
86 int vpmd_freemsk = 0;
88 #define VPM_S_PAD 64
89 union vpm_cpu {
90 struct {
91 int vcpu_free_ndx;
92 ulong_t vcpu_hits;
93 ulong_t vcpu_misses;
94 } vcpu;
95 char vpm_pad[VPM_S_PAD];
97 static union vpm_cpu *vpmd_cpu;
99 #define vfree_ndx vcpu.vcpu_free_ndx
101 int vpm_cachemode = VPMCACHE_LRU;
103 #define PPMTX(pp) (&(pp)->p_ilock)
105 static struct vpmap *vpmd_vpmap; /* list of vpmap structs preallocated */
106 static struct vpmfree *vpmd_free;
107 #define VPMAPMTX(vpm) (&vpm->vpm_mtx)
108 #define VPMAP2VMF(vpm) (&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
109 #define VPMAP2VMF_NDX(vpm) (ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
110 #define VPMP(id) (&vpmd_vpmap[id - 1])
111 #define VPMID(vpm) (uint_t)((vpm - vpmd_vpmap) + 1)
114 #ifdef DEBUG
116 struct vpm_debug {
117 int vpmd_steals;
118 int vpmd_contend;
119 int vpmd_prevpagelocked;
120 int vpmd_getpagefailed;
121 int vpmd_zerostart;
122 int vpmd_emptyfreelist;
123 int vpmd_nofreevpms;
124 } vpm_debug;
126 #define VPM_DEBUG(x) ((vpm_debug.x)++)
128 int steals;
129 int steals_mtbf = 7;
130 int contend;
131 int contend_mtbf = 127;
133 #define VPM_MTBF(v, f) (((++(v)) & (f)) != (f))
135 #else /* DEBUG */
137 #define VPM_MTBF(v, f) (1)
138 #define VPM_DEBUG(x) /* nothing */
140 #endif
143 * The vpm cache.
145 * The main purpose of having a cache here is to speed up page_lookup()
146 * operations and also provide an LRU(default) behaviour of file pages. The
147 * page_lookup() operation tends to be expensive if a page has to be
148 * reclaimed from the system page cache("cachelist"). Once we speed up the
149 * page_lookup()->page_reclaim() path then there should be no need for
150 * this cache. The system page cache(cachelist) should effectively serve the
151 * purpose of caching file pages.
153 * This cache is very similar to segmap's smap cache. Each page in the
154 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
155 * hash table. The page_t has a reference to the vpmap_t when cached. For a
156 * given vnode, offset the page is found by means of a page_lookup() operation.
157 * Any page which has a mapping(i.e when cached) will not be in the
158 * system 'cachelist'. Hence the page_lookup() will not have to do a
159 * page_reclaim(). That is how the cache serves to speed up page_lookup()
160 * operations.
162 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
165 void
166 vpm_init()
168 long npages;
169 struct vpmap *vpm;
170 struct vpmfree *vpmflp;
171 int i, ndx;
172 extern void prefetch_smap_w(void *);
174 if (!kpm_enable) {
175 vpm_enable = 0;
178 if (!vpm_enable || !vpm_cache_enable) {
179 return;
183 * Set the size of the cache.
185 vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
186 if (vpm_cache_size < VPMAP_MINCACHE) {
187 vpm_cache_size = VPMAP_MINCACHE;
190 if (vpm_cache_size > VPMAP_MAXCACHE) {
191 vpm_cache_size = VPMAP_MAXCACHE;
195 * Number of freelists.
197 if (vpm_nfreelist == 0) {
198 vpm_nfreelist = max_ncpus;
199 } else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
200 cmn_err(CE_WARN, "vpmap create : number of freelist "
201 "vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
202 vpm_nfreelist = 2 * max_ncpus;
206 * Round it up to the next power of 2
208 if (!ISP2(vpm_nfreelist)) {
209 vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
211 vpmd_freemsk = vpm_nfreelist - 1;
214 * Use a per cpu rotor index to spread the allocations evenly
215 * across the available vpm freelists.
217 vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
218 ndx = 0;
219 for (i = 0; i < max_ncpus; i++) {
221 vpmd_cpu[i].vfree_ndx = ndx;
222 ndx = (ndx + 1) & vpmd_freemsk;
226 * Allocate and initialize the freelist.
228 vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
229 KM_SLEEP);
230 for (i = 0; i < vpm_nfreelist; i++) {
232 vpmflp = &vpmd_free[i];
234 * Set up initial queue pointers. They will get flipped
235 * back and forth.
237 vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
238 vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
241 npages = mmu_btop(vpm_cache_size);
245 * Allocate and initialize the vpmap structs. We need to
246 * walk the array backwards as the prefetch happens in reverse
247 * order.
249 vpmd_vpmap = kmem_alloc(sizeof (struct vpmap) * npages, KM_SLEEP);
250 for (vpm = &vpmd_vpmap[npages - 1]; vpm >= vpmd_vpmap; vpm--) {
251 struct vpmfree *vpmflp;
252 union vpm_freeq *releq;
253 struct vpmap *vpmapf;
256 * Use prefetch as we have to walk thru a large number of
257 * these data structures. We just use the smap's prefetch
258 * routine as it does the same.
260 prefetch_smap_w((void *)vpm);
262 vpm->vpm_vp = NULL;
263 vpm->vpm_off = 0;
264 vpm->vpm_pp = NULL;
265 vpm->vpm_refcnt = 0;
266 mutex_init(&vpm->vpm_mtx, NULL, MUTEX_DEFAULT, NULL);
267 vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
269 vpmflp = VPMAP2VMF(vpm);
270 releq = vpmflp->vpm_releq;
272 vpmapf = releq->vpmq_free;
273 if (vpmapf == NULL) {
274 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
275 } else {
276 vpm->vpm_next = vpmapf;
277 vpm->vpm_prev = vpmapf->vpm_prev;
278 vpmapf->vpm_prev = vpm;
279 vpm->vpm_prev->vpm_next = vpm;
280 releq->vpmq_free = vpm->vpm_next;
284 * Indicate that the vpmap is on the releq at start
286 vpm->vpm_ndxflg = VPMRELEQ;
/*
 * VPMAP_RMFREELIST(vpm)
 * Unhooks vpm from its freelist if it is still on one (vpm_next !=
 * NULL). Callers in this file invoke it while holding the vpmap mutex;
 * the freelist queue mutex is taken and dropped here.
 * Wrapped in do { } while (0) so it expands safely as a single
 * statement (e.g. inside an unbraced if/else), and the argument is
 * parenthesized for macro hygiene.
 */
#define	VPMAP_RMFREELIST(vpm) \
	do { \
		if ((vpm)->vpm_next != NULL) { \
			union vpm_freeq *freeq; \
			struct vpmfree *vpmflp; \
			vpmflp = &vpmd_free[(vpm)->vpm_free_ndx]; \
			freeq = &vpmflp->vpm_freeq[(vpm)->vpm_ndxflg]; \
			mutex_enter(&freeq->vpmq_mtx); \
			if (freeq->vpmq_free != (vpm)) { \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} else if ((vpm) == (vpm)->vpm_next) { \
				/* Last element on the queue. */ \
				freeq->vpmq_free = NULL; \
			} else { \
				freeq->vpmq_free = (vpm)->vpm_next; \
				(vpm)->vpm_prev->vpm_next = (vpm)->vpm_next; \
				(vpm)->vpm_next->vpm_prev = (vpm)->vpm_prev; \
			} \
			mutex_exit(&freeq->vpmq_mtx); \
			(vpm)->vpm_next = (vpm)->vpm_prev = NULL; \
		} \
	} while (0)
317 static int
318 get_freelndx(int mode)
320 int ndx;
322 ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
323 switch (mode) {
325 case VPMCACHE_LRU:
326 default:
327 vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
328 break;
330 return (ndx);
335 * Find one vpmap structure from the free lists and use it for the newpage.
336 * The previous page it cached is dissociated and released. The page_t's
337 * p_vpmref is cleared only when the vpm it is pointing to is locked(or
338 * for AMD64 when the page is exclusively locked in page_unload. That is
339 * because the p_vpmref is treated as mapping).
341 * The page's p_vpmref is set when the page is
342 * locked(at least SHARED locked).
/*
 * Pick a vpmap slot from the freelists for 'newpage', dissociating and
 * releasing whatever page that slot previously cached. Returns with the
 * chosen vpmap's mutex held (see get_vpmap(), which relies on this).
 * If another thread already bound a vpmap to newpage, that vpmap is
 * validated and reused instead.
 */
344 static struct vpmap *
345 get_free_vpmap(page_t *newpage)
347 struct vpmfree *vpmflp;
348 kmutex_t *vmtx;
349 struct vpmap *vpm, *first;
350 union vpm_freeq *allocq, *releq;
351 page_t *pp = NULL;
/* NOTE(review): page_locked is reset below but no visible line sets it
 * to 1 in this excerpt — confirm against upstream before relying on it. */
352 int end_ndx, page_locked = 0;
353 int free_ndx;
356 * get the freelist bin index.
358 free_ndx = get_freelndx(vpm_cachemode);
360 end_ndx = free_ndx;
361 vpmflp = &vpmd_free[free_ndx];
/* Top of the retry protocol: re-read the (possibly flipped) alloc queue. */
363 retry_queue:
364 allocq = vpmflp->vpm_allocq;
365 mutex_enter(&allocq->vpmq_mtx);
367 if ((vpm = allocq->vpmq_free) == NULL) {
369 skip_queue:
371 * The alloc list is empty or this queue is being skipped;
372 * first see if the allocq toggled.
374 if (vpmflp->vpm_allocq != allocq) {
375 /* queue changed */
376 mutex_exit(&allocq->vpmq_mtx);
377 goto retry_queue;
379 releq = vpmflp->vpm_releq;
380 if (!mutex_tryenter(&releq->vpmq_mtx)) {
381 /* cannot get releq; a free vpmap may be there now */
382 mutex_exit(&allocq->vpmq_mtx);
385 * This loop could spin forever if this thread has
386 * higher priority than the thread that is holding
387 * releq->vpmq_mtx. In order to force the other thread
388 * to run, we'll lock/unlock the mutex which is safe
389 * since we just unlocked the allocq mutex.
391 mutex_enter(&releq->vpmq_mtx);
392 mutex_exit(&releq->vpmq_mtx);
393 goto retry_queue;
/* Both queue mutexes are now held; see whether anything was released. */
395 if (releq->vpmq_free == NULL) {
396 VPM_DEBUG(vpmd_emptyfreelist);
398 * This freelist is empty.
399 * This should not happen unless clients
400 * are failing to release the vpmap after
401 * accessing the data. Before resorting
402 * to sleeping, try the next list of the same color.
404 free_ndx = (free_ndx + 1) & vpmd_freemsk;
405 if (free_ndx != end_ndx) {
406 mutex_exit(&releq->vpmq_mtx);
407 mutex_exit(&allocq->vpmq_mtx);
408 vpmflp = &vpmd_free[free_ndx];
409 goto retry_queue;
412 * Tried all freelists.
413 * wait on this list and hope something gets freed.
/* Holding both queue mutexes: drop freeq[1] and sleep on freeq[0],
 * which is the mutex free_vpmap()'s waker protocol uses. */
415 vpmflp->vpm_want++;
416 mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
417 cv_wait(&vpmflp->vpm_free_cv,
418 &vpmflp->vpm_freeq[0].vpmq_mtx);
419 vpmflp->vpm_want--;
420 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
421 vpmflp = &vpmd_free[free_ndx];
422 VPM_DEBUG(vpmd_nofreevpms);
423 goto retry_queue;
424 } else {
426 * Something on the rele queue; flip the alloc
427 * and rele queues and retry.
429 vpmflp->vpm_allocq = releq;
430 vpmflp->vpm_releq = allocq;
431 mutex_exit(&allocq->vpmq_mtx);
432 mutex_exit(&releq->vpmq_mtx);
433 if (page_locked) {
434 ddi_msleep(250);
435 page_locked = 0;
437 goto retry_queue;
/* A candidate vpm was found at the head of the alloc queue. */
439 } else {
440 int gotnewvpm;
441 kmutex_t *pmtx;
442 uint_t vpmref;
445 * Fastpath the case we get the vpmap mutex
446 * on the first try.
448 first = vpm;
449 next_vpmap:
450 vmtx = VPMAPMTX(vpm);
451 if (!mutex_tryenter(vmtx)) {
453 * Another thread is trying to reclaim this slot.
454 * Skip to the next queue or vpmap.
456 if ((vpm = vpm->vpm_next) == first) {
457 goto skip_queue;
458 } else {
459 goto next_vpmap;
464 * Assign this vpm to the newpage.
466 pmtx = PPMTX(newpage);
467 gotnewvpm = 0;
468 mutex_enter(pmtx);
471 * Check if some other thread already assigned a vpm to
472 * this page.
474 if ((vpmref = newpage->p_vpmref) == 0) {
475 newpage->p_vpmref = VPMID(vpm);
476 gotnewvpm = 1;
477 } else {
478 VPM_DEBUG(vpmd_contend);
479 mutex_exit(vmtx);
481 mutex_exit(pmtx);
/* This thread won the race to bind vpm to newpage. */
483 if (gotnewvpm) {
486 * At this point, we've selected the vpm. Remove vpm
487 * from its freelist. If vpm is the first one in
488 * the freelist, update the head of the freelist.
490 if (first == vpm) {
491 ASSERT(first == allocq->vpmq_free);
492 allocq->vpmq_free = vpm->vpm_next;
496 * If the head of the freelist still points to vpm,
497 * then there are no more free vpmaps in that list.
499 if (allocq->vpmq_free == vpm)
501 * Took the last one
503 allocq->vpmq_free = NULL;
504 else {
505 vpm->vpm_prev->vpm_next = vpm->vpm_next;
506 vpm->vpm_next->vpm_prev = vpm->vpm_prev;
508 mutex_exit(&allocq->vpmq_mtx);
509 vpm->vpm_prev = vpm->vpm_next = NULL;
512 * Disassociate the previous page.
513 * p_vpmref is used as a mapping reference to the page.
515 if ((pp = vpm->vpm_pp) != NULL &&
516 vpm->vpm_vp == pp->p_vnode &&
517 vpm->vpm_off == pp->p_offset) {
519 pmtx = PPMTX(pp);
520 if (page_trylock(pp, SE_SHARED)) {
522 * Now verify that it is the correct
523 * page. If not someone else stole it,
524 * so just unlock it and leave.
526 mutex_enter(pmtx);
527 if (PP_ISFREE(pp) ||
528 vpm->vpm_vp != pp->p_vnode ||
529 vpm->vpm_off != pp->p_offset ||
530 pp->p_vpmref != VPMID(vpm)) {
531 mutex_exit(pmtx);
533 page_unlock(pp);
534 } else {
536 * Release the page.
538 pp->p_vpmref = 0;
539 mutex_exit(pmtx);
540 (void) page_release(pp, 1);
542 } else {
544 * If the page cannot be locked, just
545 * clear the p_vpmref and go.
547 mutex_enter(pmtx);
548 if (pp->p_vpmref == VPMID(vpm)) {
549 pp->p_vpmref = 0;
551 mutex_exit(pmtx);
552 VPM_DEBUG(vpmd_prevpagelocked);
557 * Setup vpm to point to the new page.
559 vpm->vpm_pp = newpage;
560 vpm->vpm_vp = newpage->p_vnode;
561 vpm->vpm_off = newpage->p_offset;
/* Lost the race: newpage already has a vpm (vpmref); validate and use it. */
563 } else {
564 int steal = !VPM_MTBF(steals, steals_mtbf);
566 * Page already has a vpm assigned just use that.
567 * Grab the vpm mutex and verify that it is still
568 * the correct one. The pp->p_vpmref should not change
569 * once we have the vpm mutex and the page lock.
571 mutex_exit(&allocq->vpmq_mtx);
572 vpm = VPMP(vpmref);
573 vmtx = VPMAPMTX(vpm);
574 mutex_enter(vmtx);
575 if ((steal && vpm->vpm_refcnt == 0) ||
576 vpm->vpm_pp != newpage) {
578 * The vpm got stolen, retry.
579 * clear the p_vpmref.
581 pmtx = PPMTX(newpage);
582 mutex_enter(pmtx);
583 if (newpage->p_vpmref == vpmref) {
584 newpage->p_vpmref = 0;
586 mutex_exit(pmtx);
588 mutex_exit(vmtx);
589 VPM_DEBUG(vpmd_steals);
590 goto retry_queue;
591 } else if (vpm->vpm_refcnt == 0) {
593 * Remove it from the free list if it
594 * exists there.
596 VPMAP_RMFREELIST(vpm);
/* Return with the vpmap mutex still held. */
599 return (vpm);
603 static void
604 free_vpmap(struct vpmap *vpm)
606 struct vpmfree *vpmflp;
607 struct vpmap *vpmfreelist;
608 union vpm_freeq *releq;
610 ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
612 if (vpm->vpm_refcnt != 0) {
613 panic("free_vpmap");
614 /*NOTREACHED*/
617 vpmflp = &vpmd_free[vpm->vpm_free_ndx];
619 * Add to the tail of the release queue
620 * Note that vpm_releq and vpm_allocq could toggle
621 * before we get the lock. This does not affect
622 * correctness as the 2 queues are only maintained
623 * to reduce lock pressure.
625 releq = vpmflp->vpm_releq;
626 if (releq == &vpmflp->vpm_freeq[0]) {
627 vpm->vpm_ndxflg = 0;
628 } else {
629 vpm->vpm_ndxflg = 1;
631 mutex_enter(&releq->vpmq_mtx);
632 vpmfreelist = releq->vpmq_free;
633 if (vpmfreelist == 0) {
634 int want;
636 releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
638 * Both queue mutexes are held to set vpm_want;
639 * snapshot the value before dropping releq mutex.
640 * If vpm_want appears after the releq mutex is dropped,
641 * then the vpmap just freed is already gone.
643 want = vpmflp->vpm_want;
644 mutex_exit(&releq->vpmq_mtx);
646 * See if there was a waiter before dropping the releq mutex
647 * then recheck after obtaining vpm_freeq[0] mutex as
648 * the another thread may have already signaled.
650 if (want) {
651 mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
652 if (vpmflp->vpm_want)
653 cv_signal(&vpmflp->vpm_free_cv);
654 mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
656 } else {
657 vpm->vpm_next = vpmfreelist;
658 vpm->vpm_prev = vpmfreelist->vpm_prev;
659 vpmfreelist->vpm_prev = vpm;
660 vpm->vpm_prev->vpm_next = vpm;
661 mutex_exit(&releq->vpmq_mtx);
666 * Get the vpmap for the page.
667 * The refcnt of this vpm is incremented.
669 static struct vpmap *
670 get_vpmap(page_t *pp)
672 struct vpmap *vpm = NULL;
673 kmutex_t *vmtx;
674 kmutex_t *pmtx;
675 unsigned int refid;
677 ASSERT((pp != NULL) && PAGE_LOCKED(pp));
679 if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
680 vpm = VPMP(refid);
681 vmtx = VPMAPMTX(vpm);
682 mutex_enter(vmtx);
684 * Since we have the page lock and the vpm mutex, the
685 * pp->p_vpmref cannot change.
687 if (vpm->vpm_pp != pp) {
688 pmtx = PPMTX(pp);
691 * Clear the p_vpmref as it is incorrect.
692 * This can happen if the page was stolen.
693 * On x64 this should not happen as p_vpmref
694 * is treated as a mapping on the page. So
695 * if the page is stolen, the mapping would have
696 * been cleared in page_unload().
698 mutex_enter(pmtx);
699 if (pp->p_vpmref == refid)
700 pp->p_vpmref = 0;
701 mutex_exit(pmtx);
703 mutex_exit(vmtx);
704 vpm = NULL;
705 } else if (vpm->vpm_refcnt == 0) {
707 * Got the vpm, remove it from the free
708 * list if it exists there.
710 VPMAP_RMFREELIST(vpm);
713 if (vpm == NULL) {
715 * get_free_vpmap() returns with the vpmap mutex held.
717 vpm = get_free_vpmap(pp);
718 vmtx = VPMAPMTX(vpm);
719 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
720 } else {
721 vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
724 vpm->vpm_refcnt++;
725 mutex_exit(vmtx);
727 return (vpm);
730 /* END --- vpm cache ---- */
733 * The vnode page mapping(vpm) interface routines.
737 * Find or create the pages starting form baseoff for specified
738 * length 'len'.
740 static int
741 vpm_pagecreate(
742 struct vnode *vp,
743 uoff_t baseoff,
744 size_t len,
745 vmap_t vml[],
746 int nseg,
747 int *newpage)
750 page_t *pp = NULL;
751 caddr_t base;
752 uoff_t off = baseoff;
753 int i;
754 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
756 for (i = 0; len > 0; len -= PAGESIZE, i++) {
757 struct vpmap *vpm;
760 if ((pp = page_lookup(&vp->v_object, off, SE_SHARED)) == NULL) {
762 base = segkpm_create_va(off);
765 * the seg pointer passed in is just advisor. Just
766 * pass segkmap for now like segmap does with
767 * segmap_kpm enabled.
769 if ((pp = page_create_va(&vp->v_object, off, PAGESIZE,
770 PG_WAIT, segkmap,
771 base)) == NULL) {
772 panic("segmap_pagecreate_vpm: "
773 "page_create failed");
774 /*NOTREACHED*/
776 if (newpage != NULL)
777 *newpage = 1;
779 page_io_unlock(pp);
783 * Get the vpm for this page_t.
785 if (vpm_cache_enable) {
786 vpm = get_vpmap(pp);
787 vml[i].vs_data = (void *)&vpm->vpm_pp;
788 } else {
789 vml[i].vs_data = (void *)pp;
790 pp->p_vpmref = 0;
793 vml[i].vs_addr = hat_kpm_mapin(pp, 0);
794 vml[i].vs_len = PAGESIZE;
796 off += PAGESIZE;
798 vml[i].vs_data = NULL;
799 vml[i].vs_addr = NULL;
800 return (0);
805 * Returns vpm mappings of pages in the range [off, off+len], where
806 * len is rounded up to the PAGESIZE boundary. The list of pages and
807 * the page addresses are returned in the SGL vml (vmap_t) array passed in.
808 * The nseg is the number of vmap_t entries in the array.
810 * The segmap's SM_LOCKPROTO usage is not supported by these interfaces.
811 * For such cases, use the seg_map interfaces.
/*
 * Map the pages covering [off, off + len) of vp into the vml[] scatter/
 * gather list (at most nseg - 1 pages; one slot terminates the list with
 * vs_addr == NULL). If !fetchpage the pages are created via
 * vpm_pagecreate(); otherwise pages missing from the cache (or not yet
 * modified, for S_WRITE) are fetched with fop_getpage().
 * Returns 0 or the fop_getpage() error.
 */
814 vpm_map_pages(
815 struct vnode *vp,
816 uoff_t off,
817 size_t len,
818 int fetchpage,
819 vmap_t *vml,
820 int nseg,
821 int *newpage,
822 enum seg_rw rw)
824 extern struct vnode *common_specvp();
825 uoff_t baseoff;
826 uint_t prot;
827 caddr_t base;
828 page_t *pp, *pplist[MAXVMAPS];
829 struct vpmap *vpm;
830 int i, error = 0;
831 size_t tlen;
833 ASSERT(nseg >= MINVMAPS && nseg <= MAXVMAPS);
834 baseoff = off & (offset_t)PAGEMASK;
835 vml[0].vs_data = NULL;
836 vml[0].vs_addr = NULL;
/* tlen is the page-rounded span starting at the page boundary. */
838 tlen = P2ROUNDUP(off + len, PAGESIZE) - baseoff;
840 * Restrict it to VPMMAXLEN.
842 if (tlen > (VPMMAXPGS * PAGESIZE)) {
843 tlen = VPMMAXPGS * PAGESIZE;
846 * Ensure length fits within the vml[] array. One element of
847 * the array is used to mark the end of the scatter/gather list
848 * of valid mappings by setting its vs_addr = NULL. Leave space
849 * for this element.
851 if (tlen > ((nseg - 1) * PAGESIZE)) {
852 tlen = ((nseg - 1) * PAGESIZE);
854 len = tlen;
857 * If this is a block device we have to be sure to use the
858 * "common" block device vnode for the mapping.
860 if (vp->v_type == VBLK)
861 vp = common_specvp(vp);
864 if (!fetchpage)
865 return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
/*
 * Note the increment clause order: after each pass i++ runs, then the
 * NEXT pplist slot is NULLed, keeping the list NULL-terminated as it
 * grows. (On the error path pplist[0] is NULLed explicitly below.)
 */
867 for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
869 pp = page_lookup(&vp->v_object, baseoff, SE_SHARED);
872 * If we did not find the page or if this page was not
873 * in vpm cache(p_vpmref == 0), then let fop_getpage get
874 * all the pages.
875 * We need to call fop_getpage so that filesystems can do some
876 * (un)necessary tracking for sequential access.
879 if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
880 (rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
881 != (P_MOD | P_REF))) {
882 int j;
883 if (pp != NULL) {
884 page_unlock(pp);
887 * If we did not find the desired set of pages,
888 * from the page cache, just call fop_getpage to get
889 * all the pages.
/* Drop the shared locks taken on the pages gathered so far. */
891 for (j = 0; j < i; j++) {
892 page_unlock(pplist[j]);
896 baseoff = off & (offset_t)PAGEMASK;
898 * Pass a dummy address as it will be required
899 * by page_create_va(). We pass segkmap as the seg
900 * as some file systems(UFS) check it.
902 base = segkpm_create_va(baseoff);
904 error = fop_getpage(vp, baseoff, tlen, &prot, pplist,
905 tlen, segkmap, base, rw, CRED(), NULL);
906 if (error) {
907 VPM_DEBUG(vpmd_getpagefailed);
908 pplist[0] = NULL;
910 break;
911 } else {
912 pplist[i] = pp;
913 baseoff += PAGESIZE;
/* Error unwind: unlock everything gathered, clear the SGL, bail. */
917 if (error) {
918 for (i = 0; pplist[i] != NULL; i++) {
919 page_unlock(pplist[i]);
920 pplist[i] = NULL;
922 vml[0].vs_addr = NULL;
923 vml[0].vs_data = NULL;
924 return (error);
928 * Get the vpm's for pages.
930 for (i = 0; pplist[i] != NULL; i++) {
931 if (vpm_cache_enable) {
932 vpm = get_vpmap(pplist[i]);
933 vml[i].vs_data = (void *)&(vpm->vpm_pp);
934 } else {
935 vml[i].vs_data = (void *)pplist[i];
936 pplist[i]->p_vpmref = 0;
939 vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
940 vml[i].vs_len = PAGESIZE;
/* Terminate the scatter/gather list. */
943 vml[i].vs_data = NULL;
944 vml[i].vs_addr = NULL;
946 return (0);
950 * Release the vpm mappings on the pages and unlock them.
952 void
953 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
955 int i;
956 struct vpmap *vpm;
957 kmutex_t *mtx;
958 page_t *pp;
960 for (i = 0; vml[i].vs_data != NULL; i++) {
961 ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
963 if (vpm_cache_enable) {
964 pp = *(((page_t **)vml[i].vs_data));
965 } else {
966 pp = (page_t *)vml[i].vs_data;
970 * Mark page as being modified or referenced, bacause vpm pages
971 * would not cause faults where it would be set normally.
973 if (rw == S_WRITE) {
974 hat_setrefmod(pp);
975 } else {
976 ASSERT(rw == S_READ);
977 hat_setref(pp);
980 if (vpm_cache_enable) {
981 vpm = (struct vpmap *)((char *)vml[i].vs_data
982 - offsetof(struct vpmap, vpm_pp));
983 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
984 page_unlock(pp);
985 mtx = VPMAPMTX(vpm);
986 mutex_enter(mtx);
988 if (--vpm->vpm_refcnt == 0) {
989 free_vpmap(vpm);
991 mutex_exit(mtx);
992 } else {
993 hat_kpm_mapout(pp, 0, vml[i].vs_addr);
994 (void) page_release(pp, 1);
996 vml[i].vs_data = NULL;
997 vml[i].vs_addr = NULL;
1002 * Given the vp, off and the uio structure, this routine will do the
1003 * the copy (uiomove). If the last page created is partially written,
1004 * the rest of the page is zeroed out. It also zeros the beginning of
1005 * the first page till the start offset if requested(zerostart).
1006 * If pages are to be fetched, it will call the filesystem's getpage
1007 * function (fop_getpage) to get them, otherwise they will be created if
1008 * not already present in the page cache.
/*
 * Copy between the uio and vp's pages over [off, off + len) using vpm
 * mappings. When pages are newly created (!fetchpage), the area of the
 * first page before 'off' is zeroed if 'zerostart', and the tail of the
 * last page written is zeroed after the copy. Returns 0 or the error
 * from vpm_map_pages()/uiomove().
 */
1011 vpm_data_copy(struct vnode *vp,
1012 uoff_t off,
1013 size_t len,
1014 struct uio *uio,
1015 int fetchpage,
1016 int *newpage,
1017 int zerostart,
1018 enum seg_rw rw)
1020 int error;
1021 struct vmap vml[MINVMAPS];
1022 enum uio_rw uiorw;
1023 int npages = 0;
1025 uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
1027 * 'off' will be the offset where the I/O starts.
1028 * We get the pages starting at the (off & PAGEMASK)
1029 * page boundary.
1031 error = vpm_map_pages(vp, off, (uint_t)len,
1032 fetchpage, vml, MINVMAPS, &npages, rw);
1034 if (newpage != NULL)
1035 *newpage = npages;
1036 if (!error) {
/* pon is the in-page start offset; it applies to the first page only. */
1037 int i, pn, slen = len;
1038 int pon = off & PAGEOFFSET;
1041 * Clear from the beginning of the page to start offset
1042 * if requested.
1044 if (!fetchpage && zerostart) {
1045 (void) kzero(vml[0].vs_addr, (uint_t)pon);
1046 VPM_DEBUG(vpmd_zerostart);
/* Copy page by page; after the first pass pon resets to 0. */
1049 for (i = 0; !error && slen > 0 &&
1050 vml[i].vs_addr != NULL; i++) {
1051 pn = (int)MIN(slen, (PAGESIZE - pon));
1052 error = uiomove(vml[i].vs_addr + pon,
1053 (long)pn, uiorw, uio);
1054 slen -= pn;
1055 pon = 0;
1059 * When new pages are created, zero out part of the
1060 * page we did not copy to.
1062 if (!fetchpage && npages &&
1063 uio->uio_loffset < roundup(off + len, PAGESIZE)) {
1064 int nzero;
/* uio_loffset now sits where the copy stopped; zero to page end. */
1066 pon = (uio->uio_loffset & PAGEOFFSET);
1067 nzero = PAGESIZE - pon;
1068 i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
1069 (void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
1071 vpm_unmap_pages(vml, rw);
1073 return (error);
1077 * called to flush pages for the given vnode covering
1078 * [off, off+len] range.
1081 vpm_sync_pages(struct vnode *vp,
1082 uoff_t off,
1083 size_t len,
1084 uint_t flags)
1086 extern struct vnode *common_specvp();
1087 int bflags = 0;
1088 int error = 0;
1089 size_t psize = roundup(len, PAGESIZE);
1092 * If this is a block device we have to be sure to use the
1093 * "common" block device vnode for the mapping.
1095 if (vp->v_type == VBLK)
1096 vp = common_specvp(vp);
1098 if ((flags & ~SM_DONTNEED) != 0) {
1099 if (flags & SM_ASYNC)
1100 bflags |= B_ASYNC;
1101 if (flags & SM_INVAL)
1102 bflags |= B_INVAL;
1103 if (flags & SM_DESTROY)
1104 bflags |= (B_INVAL|B_TRUNC);
1105 if (flags & SM_FREE)
1106 bflags |= B_FREE;
1107 if (flags & SM_DONTNEED)
1108 bflags |= B_DONTNEED;
1110 error = fop_putpage(vp, off, psize, bflags, CRED(), NULL);
1113 return (error);
1117 #else /* SEGKPM_SUPPORT */
1119 /* vpm stubs */
/* Stub for builds without SEGKPM_SUPPORT: VPM init is a no-op. */
1120 void
1121 vpm_init()
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1125 /*ARGSUSED*/
1127 vpm_pagecreate(
1128 struct vnode *vp,
1129 uoff_t baseoff,
1130 size_t len,
1131 vmap_t vml[],
1132 int nseg,
1133 int *newpage)
1135 return (0);
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1138 /*ARGSUSED*/
1140 vpm_map_pages(
1141 struct vnode *vp,
1142 uoff_t off,
1143 size_t len,
1144 int fetchpage,
1145 vmap_t vml[],
1146 int nseg,
1147 int *newpage,
1148 enum seg_rw rw)
1150 return (0);
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1153 /*ARGSUSED*/
1155 vpm_data_copy(struct vnode *vp,
1156 uoff_t off,
1157 size_t len,
1158 struct uio *uio,
1159 int fetchpage,
1160 int *newpage,
1161 int zerostart,
1162 enum seg_rw rw)
1164 return (0);
/* Stub for builds without SEGKPM_SUPPORT: nothing to unmap. */
1167 /*ARGSUSED*/
1168 void
1169 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
/* Stub for builds without SEGKPM_SUPPORT: always reports success. */
1172 /*ARGSUSED*/
1174 vpm_sync_pages(struct vnode *vp,
1175 uoff_t off,
1176 size_t len,
1177 uint_t flags)
1179 return (0);
1181 #endif /* SEGKPM_SUPPORT */