usr/src/uts/common/vm/seg_map.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * Portions of this source code were derived from Berkeley 4.3 BSD
  31  * under license from the Regents of the University of California.
  32  */
  33
  34 /*
  35  * VM - generic vnode mapping segment.
  36  *
  37  * The segmap driver is used only by the kernel to get faster (than seg_vn)
  38  * mappings [lower routine overhead; more persistent cache] to random
  39  * vnode/offsets.  Note than the kernel may (and does) use seg_vn as well.
  40  */
  41
  42 #include <sys/types.h>
  43 #include <sys/t_lock.h>
  44 #include <sys/param.h>
  45 #include <sys/sysmacros.h>
  46 #include <sys/buf.h>
  47 #include <sys/systm.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mman.h>
  50 #include <sys/errno.h>
  51 #include <sys/cred.h>
  52 #include <sys/kmem.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/debug.h>
  56 #include <sys/thread.h>
  57 #include <sys/dumphdr.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/lgrp.h>
  60
  61 #include <vm/seg_kmem.h>
  62 #include <vm/hat.h>
  63 #include <vm/as.h>
  64 #include <vm/seg.h>
  65 #include <vm/seg_kpm.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/page.h>
  68 #include <vm/pvn.h>
  69 #include <vm/rm.h>
  70
  71 /*
  72  * Private seg op routines.
  73  */
  74 static void     segmap_free(struct seg *seg);
  75 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  76                         size_t len, enum fault_type type, enum seg_rw rw);
  77 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
  78 static int      segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
  79                         uint_t prot);
  80 static int      segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
  81 static int      segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
  82                         uint_t *protv);
  83 static u_offset_t       segmap_getoffset(struct seg *seg, caddr_t addr);
  84 static int      segmap_gettype(struct seg *seg, caddr_t addr);
  85 static int      segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
  86 static void     segmap_dump(struct seg *seg);
  87 static int      segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
  88                         struct page ***ppp, enum lock_type type,
  89                         enum seg_rw rw);
  90 static void     segmap_badop(void);
  91 static int      segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
  92 static lgrp_mem_policy_info_t   *segmap_getpolicy(struct seg *seg,
  93     caddr_t addr);
  94 static int      segmap_capable(struct seg *seg, segcapability_t capability);
  95
  96 /* segkpm support */
  97 static caddr_t  segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
  98                         struct smap *, enum seg_rw);
  99 struct smap     *get_smap_kpm(caddr_t, page_t **);
 100
 101 #define SEGMAP_BADOP(t) (t(*)())segmap_badop
 102
 103 static struct seg_ops segmap_ops = {
 104         SEGMAP_BADOP(int),      /* dup */
 105         SEGMAP_BADOP(int),      /* unmap */
 106         segmap_free,
 107         segmap_fault,
 108         segmap_faulta,
 109         SEGMAP_BADOP(int),      /* setprot */
 110         segmap_checkprot,
 111         segmap_kluster,
 112         SEGMAP_BADOP(size_t),   /* swapout */
 113         SEGMAP_BADOP(int),      /* sync */
 114         SEGMAP_BADOP(size_t),   /* incore */
 115         SEGMAP_BADOP(int),      /* lockop */
 116         segmap_getprot,
 117         segmap_getoffset,
 118         segmap_gettype,
 119         segmap_getvp,
 120         SEGMAP_BADOP(int),      /* advise */
 121         segmap_dump,
 122         segmap_pagelock,        /* pagelock */
 123         SEGMAP_BADOP(int),      /* setpgsz */
 124         segmap_getmemid,        /* getmemid */
 125         segmap_getpolicy,       /* getpolicy */
 126         segmap_capable,         /* capable */
 127         seg_inherit_notsup      /* inherit */
 128 };
 129
 130 /*
 131  * Private segmap routines.
 132  */
 133 static void     segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
 134                         size_t len, enum seg_rw rw, struct smap *smp);
 135 static void     segmap_smapadd(struct smap *smp);
 136 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
 137                         u_offset_t off, int hashid);
 138 static void     segmap_hashout(struct smap *smp);
 139
 140
 141 /*
 142  * Statistics for segmap operations.
 143  *
 144  * No explicit locking to protect these stats.
 145  */
 146 struct segmapcnt segmapcnt = {
 147         { "fault",              KSTAT_DATA_ULONG },
 148         { "faulta",             KSTAT_DATA_ULONG },
 149         { "getmap",             KSTAT_DATA_ULONG },
 150         { "get_use",            KSTAT_DATA_ULONG },
 151         { "get_reclaim",        KSTAT_DATA_ULONG },
 152         { "get_reuse",          KSTAT_DATA_ULONG },
 153         { "get_unused",         KSTAT_DATA_ULONG },
 154         { "get_nofree",         KSTAT_DATA_ULONG },
 155         { "rel_async",          KSTAT_DATA_ULONG },
 156         { "rel_write",          KSTAT_DATA_ULONG },
 157         { "rel_free",           KSTAT_DATA_ULONG },
 158         { "rel_abort",          KSTAT_DATA_ULONG },
 159         { "rel_dontneed",       KSTAT_DATA_ULONG },
 160         { "release",            KSTAT_DATA_ULONG },
 161         { "pagecreate",         KSTAT_DATA_ULONG },
 162         { "free_notfree",       KSTAT_DATA_ULONG },
 163         { "free_dirty",         KSTAT_DATA_ULONG },
 164         { "free",               KSTAT_DATA_ULONG },
 165         { "stolen",             KSTAT_DATA_ULONG },
 166         { "get_nomtx",          KSTAT_DATA_ULONG }
 167 };
 168
 169 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
 170 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
 171
 172 /*
 173  * Return number of map pages in segment.
 174  */
 175 #define MAP_PAGES(seg)          ((seg)->s_size >> MAXBSHIFT)
 176
 177 /*
 178  * Translate addr into smap number within segment.
 179  */
 180 #define MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
 181
 182 /*
 183  * Translate addr in seg into struct smap pointer.
 184  */
 185 #define GET_SMAP(seg, addr)     \
 186         &(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
 187
 188 /*
 189  * Bit in map (16 bit bitmap).
 190  */
 191 #define SMAP_BIT_MASK(bitindex) (1 << ((bitindex) & 0xf))
 192
 193 static int smd_colormsk = 0;
 194 static int smd_ncolor = 0;
 195 static int smd_nfree = 0;
 196 static int smd_freemsk = 0;
 197 #ifdef DEBUG
 198 static int *colors_used;
 199 #endif
 200 static struct smap *smd_smap;
 201 static struct smaphash *smd_hash;
 202 #ifdef SEGMAP_HASHSTATS
 203 static unsigned int *smd_hash_len;
 204 #endif
 205 static struct smfree *smd_free;
 206 static ulong_t smd_hashmsk = 0;
 207
 208 #define SEGMAP_MAXCOLOR         2
 209 #define SEGMAP_CACHE_PAD        64
 210
 211 union segmap_cpu {
 212         struct {
 213                 uint32_t        scpu_free_ndx[SEGMAP_MAXCOLOR];
 214                 struct smap     *scpu_last_smap;
 215                 ulong_t         scpu_getmap;
 216                 ulong_t         scpu_release;
 217                 ulong_t         scpu_get_reclaim;
 218                 ulong_t         scpu_fault;
 219                 ulong_t         scpu_pagecreate;
 220                 ulong_t         scpu_get_reuse;
 221         } scpu;
 222         char    scpu_pad[SEGMAP_CACHE_PAD];
 223 };
 224 static union segmap_cpu *smd_cpu;
 225
 226 /*
 227  * There are three locks in seg_map:
 228  *      - per freelist mutexes
 229  *      - per hashchain mutexes
 230  *      - per smap mutexes
 231  *
 232  * The lock ordering is to get the smap mutex to lock down the slot
 233  * first then the hash lock (for hash in/out (vp, off) list) or the
 234  * freelist lock to put the slot back on the free list.
 235  *
 236  * The hash search is done by only holding the hashchain lock, when a wanted
 237  * slot is found, we drop the hashchain lock then lock the slot so there
 238  * is no overlapping of hashchain and smap locks. After the slot is
 239  * locked, we verify again if the slot is still what we are looking
 240  * for.
 241  *
 242  * Allocation of a free slot is done by holding the freelist lock,
 243  * then locking the smap slot at the head of the freelist. This is
 244  * in reversed lock order so mutex_tryenter() is used.
 245  *
 246  * The smap lock protects all fields in smap structure except for
 247  * the link fields for hash/free lists which are protected by
 248  * hashchain and freelist locks.
 249  */
 250
 251 #define SHASHMTX(hashid)        (&smd_hash[hashid].sh_mtx)
 252
 253 #define SMP2SMF(smp)            (&smd_free[(smp - smd_smap) & smd_freemsk])
 254 #define SMP2SMF_NDX(smp)        (ushort_t)((smp - smd_smap) & smd_freemsk)
 255
 256 #define SMAPMTX(smp) (&smp->sm_mtx)
 257
 258 #define SMAP_HASHFUNC(vp, off, hashid) \
 259         { \
 260         hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
 261                 ((off) >> MAXBSHIFT)) & smd_hashmsk); \
 262         }
 263
 264 /*
 265  * The most frequently updated kstat counters are kept in the
 266  * per cpu array to avoid hot cache blocks. The update function
 267  * sums the cpu local counters to update the global counters.
 268  */
 269
 270 /* ARGSUSED */
 271 int
 272 segmap_kstat_update(kstat_t *ksp, int rw)
 273 {
 274         int i;
 275         ulong_t getmap, release, get_reclaim;
 276         ulong_t fault, pagecreate, get_reuse;
 277
 278         if (rw == KSTAT_WRITE)
 279                 return (EACCES);
 280         getmap = release = get_reclaim = (ulong_t)0;
 281         fault = pagecreate = get_reuse = (ulong_t)0;
 282         for (i = 0; i < max_ncpus; i++) {
 283                 getmap += smd_cpu[i].scpu.scpu_getmap;
 284                 release  += smd_cpu[i].scpu.scpu_release;
 285                 get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
 286                 fault  += smd_cpu[i].scpu.scpu_fault;
 287                 pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
 288                 get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
 289         }
 290         segmapcnt.smp_getmap.value.ul = getmap;
 291         segmapcnt.smp_release.value.ul = release;
 292         segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
 293         segmapcnt.smp_fault.value.ul = fault;
 294         segmapcnt.smp_pagecreate.value.ul = pagecreate;
 295         segmapcnt.smp_get_reuse.value.ul = get_reuse;
 296         return (0);
 297 }
 298
 299 int
 300 segmap_create(struct seg *seg, void *argsp)
 301 {
 302         struct segmap_data *smd;
 303         struct smap *smp;
 304         struct smfree *sm;
 305         struct segmap_crargs *a = (struct segmap_crargs *)argsp;
 306         struct smaphash *shashp;
 307         union segmap_cpu *scpu;
 308         long i, npages;
 309         size_t hashsz;
 310         uint_t nfreelist;
 311         extern void prefetch_smap_w(void *);
 312         extern int max_ncpus;
 313
 314         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 315
 316         if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
 317                 panic("segkmap not MAXBSIZE aligned");
 318                 /*NOTREACHED*/
 319         }
 320
 321         smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
 322
 323         seg->s_data = (void *)smd;
 324         seg->s_ops = &segmap_ops;
 325         smd->smd_prot = a->prot;
 326
 327         /*
 328          * Scale the number of smap freelists to be
 329          * proportional to max_ncpus * number of virtual colors.
 330          * The caller can over-ride this scaling by providing
 331          * a non-zero a->nfreelist argument.
 332          */
 333         nfreelist = a->nfreelist;
 334         if (nfreelist == 0)
 335                 nfreelist = max_ncpus;
 336         else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
 337                 cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
 338                 "%d, using %d", nfreelist, max_ncpus);
 339                 nfreelist = max_ncpus;
 340         }
 341         if (!ISP2(nfreelist)) {
 342                 /* round up nfreelist to the next power of two. */
 343                 nfreelist = 1 << (highbit(nfreelist));
 344         }
 345
 346         /*
 347          * Get the number of virtual colors - must be a power of 2.
 348          */
 349         if (a->shmsize)
 350                 smd_ncolor = a->shmsize >> MAXBSHIFT;
 351         else
 352                 smd_ncolor = 1;
 353         ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
 354         ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
 355         smd_colormsk = smd_ncolor - 1;
 356         smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
 357         smd_freemsk = smd_nfree - 1;
 358
 359         /*
 360          * Allocate and initialize the freelist headers.
 361          * Note that sm_freeq[1] starts out as the release queue. This
 362          * is known when the smap structures are initialized below.
 363          */
 364         smd_free = smd->smd_free =
 365             kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
 366         for (i = 0; i < smd_nfree; i++) {
 367                 sm = &smd->smd_free[i];
 368                 mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 369                 mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
 370                 sm->sm_allocq = &sm->sm_freeq[0];
 371                 sm->sm_releq = &sm->sm_freeq[1];
 372         }
 373
 374         /*
 375          * Allocate and initialize the smap hash chain headers.
 376          * Compute hash size rounding down to the next power of two.
 377          */
 378         npages = MAP_PAGES(seg);
 379         smd->smd_npages = npages;
 380         hashsz = npages / SMAP_HASHAVELEN;
 381         hashsz = 1 << (highbit(hashsz)-1);
 382         smd_hashmsk = hashsz - 1;
 383         smd_hash = smd->smd_hash =
 384             kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
 385 #ifdef SEGMAP_HASHSTATS
 386         smd_hash_len =
 387             kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
 388 #endif
 389         for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
 390                 shashp->sh_hash_list = NULL;
 391                 mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
 392         }
 393
 394         /*
 395          * Allocate and initialize the smap structures.
 396          * Link all slots onto the appropriate freelist.
 397          * The smap array is large enough to affect boot time
 398          * on large systems, so use memory prefetching and only
 399          * go through the array 1 time. Inline a optimized version
 400          * of segmap_smapadd to add structures to freelists with
 401          * knowledge that no locks are needed here.
 402          */
 403         smd_smap = smd->smd_sm =
 404             kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
 405
 406         for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
 407             smp >= smd->smd_sm; smp--) {
 408                 struct smap *smpfreelist;
 409                 struct sm_freeq *releq;
 410
 411                 prefetch_smap_w((char *)smp);
 412
 413                 smp->sm_vp = NULL;
 414                 smp->sm_hash = NULL;
 415                 smp->sm_off = 0;
 416                 smp->sm_bitmap = 0;
 417                 smp->sm_refcnt = 0;
 418                 mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
 419                 smp->sm_free_ndx = SMP2SMF_NDX(smp);
 420
 421                 sm = SMP2SMF(smp);
 422                 releq = sm->sm_releq;
 423
 424                 smpfreelist = releq->smq_free;
 425                 if (smpfreelist == 0) {
 426                         releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 427                 } else {
 428                         smp->sm_next = smpfreelist;
 429                         smp->sm_prev = smpfreelist->sm_prev;
 430                         smpfreelist->sm_prev = smp;
 431                         smp->sm_prev->sm_next = smp;
 432                         releq->smq_free = smp->sm_next;
 433                 }
 434
 435                 /*
 436                  * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
 437                  */
 438                 smp->sm_flags = 0;
 439
 440 #ifdef  SEGKPM_SUPPORT
 441                 /*
 442                  * Due to the fragile prefetch loop no
 443                  * separate function is used here.
 444                  */
 445                 smp->sm_kpme_next = NULL;
 446                 smp->sm_kpme_prev = NULL;
 447                 smp->sm_kpme_page = NULL;
 448 #endif
 449         }
 450
 451         /*
 452          * Allocate the per color indices that distribute allocation
 453          * requests over the free lists. Each cpu will have a private
 454          * rotor index to spread the allocations even across the available
 455          * smap freelists. Init the scpu_last_smap field to the first
 456          * smap element so there is no need to check for NULL.
 457          */
 458         smd_cpu =
 459             kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
 460         for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
 461                 int j;
 462                 for (j = 0; j < smd_ncolor; j++)
 463                         scpu->scpu.scpu_free_ndx[j] = j;
 464                 scpu->scpu.scpu_last_smap = smd_smap;
 465         }
 466
 467         vpm_init();
 468
 469 #ifdef DEBUG
 470         /*
 471          * Keep track of which colors are used more often.
 472          */
 473         colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
 474 #endif /* DEBUG */
 475
 476         return (0);
 477 }
 478
 479 static void
 480 segmap_free(seg)
 481         struct seg *seg;
 482 {
 483         ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
 484 }
 485
 486 /*
 487  * Do a F_SOFTUNLOCK call over the range requested.
 488  * The range must have already been F_SOFTLOCK'ed.
 489  */
 490 static void
 491 segmap_unlock(
 492         struct hat *hat,
 493         struct seg *seg,
 494         caddr_t addr,
 495         size_t len,
 496         enum seg_rw rw,
 497         struct smap *smp)
 498 {
 499         page_t *pp;
 500         caddr_t adr;
 501         u_offset_t off;
 502         struct vnode *vp;
 503         kmutex_t *smtx;
 504
 505         ASSERT(smp->sm_refcnt > 0);
 506
 507 #ifdef lint
 508         seg = seg;
 509 #endif
 510
 511         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 512
 513                 /*
 514                  * We're called only from segmap_fault and this was a
 515                  * NOP in case of a kpm based smap, so dangerous things
 516                  * must have happened in the meantime. Pages are prefaulted
 517                  * and locked in segmap_getmapflt and they will not be
 518                  * unlocked until segmap_release.
 519                  */
 520                 panic("segmap_unlock: called with kpm addr %p", (void *)addr);
 521                 /*NOTREACHED*/
 522         }
 523
 524         vp = smp->sm_vp;
 525         off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 526
 527         hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
 528         for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
 529                 ushort_t bitmask;
 530
 531                 /*
 532                  * Use page_find() instead of page_lookup() to
 533                  * find the page since we know that it has
 534                  * "shared" lock.
 535                  */
 536                 pp = page_find(vp, off);
 537                 if (pp == NULL) {
 538                         panic("segmap_unlock: page not found");
 539                         /*NOTREACHED*/
 540                 }
 541
 542                 if (rw == S_WRITE) {
 543                         hat_setrefmod(pp);
 544                 } else if (rw != S_OTHER) {
 545                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 546                         "segmap_fault:pp %p vp %p offset %llx", pp, vp, off);
 547                         hat_setref(pp);
 548                 }
 549
 550                 /*
 551                  * Clear bitmap, if the bit corresponding to "off" is set,
 552                  * since the page and translation are being unlocked.
 553                  */
 554                 bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
 555
 556                 /*
 557                  * Large Files: Following assertion is to verify
 558                  * the correctness of the cast to (int) above.
 559                  */
 560                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
 561                 smtx = SMAPMTX(smp);
 562                 mutex_enter(smtx);
 563                 if (smp->sm_bitmap & bitmask) {
 564                         smp->sm_bitmap &= ~bitmask;
 565                 }
 566                 mutex_exit(smtx);
 567
 568                 page_unlock(pp);
 569         }
 570 }
 571
 572 #define MAXPPB  (MAXBSIZE/4096) /* assumes minimum page size of 4k */
 573
 574 /*
 575  * This routine is called via a machine specific fault handling
 576  * routine.  It is also called by software routines wishing to
 577  * lock or unlock a range of addresses.
 578  *
 579  * Note that this routine expects a page-aligned "addr".
 580  */
 581 faultcode_t
 582 segmap_fault(
 583         struct hat *hat,
 584         struct seg *seg,
 585         caddr_t addr,
 586         size_t len,
 587         enum fault_type type,
 588         enum seg_rw rw)
 589 {
 590         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 591         struct smap *smp;
 592         page_t *pp, **ppp;
 593         struct vnode *vp;
 594         u_offset_t off;
 595         page_t *pl[MAXPPB + 1];
 596         uint_t prot;
 597         u_offset_t addroff;
 598         caddr_t adr;
 599         int err;
 600         u_offset_t sm_off;
 601         int hat_flag;
 602
 603         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 604                 int newpage;
 605                 kmutex_t *smtx;
 606
 607                 /*
 608                  * Pages are successfully prefaulted and locked in
 609                  * segmap_getmapflt and can't be unlocked until
 610                  * segmap_release. No hat mappings have to be locked
 611                  * and they also can't be unlocked as long as the
 612                  * caller owns an active kpm addr.
 613                  */
 614 #ifndef DEBUG
 615                 if (type != F_SOFTUNLOCK)
 616                         return (0);
 617 #endif
 618
 619                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 620                         panic("segmap_fault: smap not found "
 621                             "for addr %p", (void *)addr);
 622                         /*NOTREACHED*/
 623                 }
 624
 625                 smtx = SMAPMTX(smp);
 626 #ifdef  DEBUG
 627                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 628                 if (newpage) {
 629                         cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
 630                             (void *)smp);
 631                 }
 632
 633                 if (type != F_SOFTUNLOCK) {
 634                         mutex_exit(smtx);
 635                         return (0);
 636                 }
 637 #endif
 638                 mutex_exit(smtx);
 639                 vp = smp->sm_vp;
 640                 sm_off = smp->sm_off;
 641
 642                 if (vp == NULL)
 643                         return (FC_MAKE_ERR(EIO));
 644
 645                 ASSERT(smp->sm_refcnt > 0);
 646
 647                 addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 648                 if (addroff + len > MAXBSIZE)
 649                         panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
 650                             (void *)(addr + len));
 651
 652                 off = sm_off + addroff;
 653
 654                 pp = page_find(vp, off);
 655
 656                 if (pp == NULL)
 657                         panic("segmap_fault: softunlock page not found");
 658
 659                 /*
 660                  * Set ref bit also here in case of S_OTHER to avoid the
 661                  * overhead of supporting other cases than F_SOFTUNLOCK
 662                  * with segkpm. We can do this because the underlying
 663                  * pages are locked anyway.
 664                  */
 665                 if (rw == S_WRITE) {
 666                         hat_setrefmod(pp);
 667                 } else {
 668                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 669                             "segmap_fault:pp %p vp %p offset %llx",
 670                             pp, vp, off);
 671                         hat_setref(pp);
 672                 }
 673
 674                 return (0);
 675         }
 676
 677         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
 678         smp = GET_SMAP(seg, addr);
 679         vp = smp->sm_vp;
 680         sm_off = smp->sm_off;
 681
 682         if (vp == NULL)
 683                 return (FC_MAKE_ERR(EIO));
 684
 685         ASSERT(smp->sm_refcnt > 0);
 686
 687         addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
 688         if (addroff + len > MAXBSIZE) {
 689                 panic("segmap_fault: endaddr %p "
 690                     "exceeds MAXBSIZE chunk", (void *)(addr + len));
 691                 /*NOTREACHED*/
 692         }
 693         off = sm_off + addroff;
 694
 695         /*
 696          * First handle the easy stuff
 697          */
 698         if (type == F_SOFTUNLOCK) {
 699                 segmap_unlock(hat, seg, addr, len, rw, smp);
 700                 return (0);
 701         }
 702
 703         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 704             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 705         err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
 706             seg, addr, rw, CRED(), NULL);
 707
 708         if (err)
 709                 return (FC_MAKE_ERR(err));
 710
 711         prot &= smd->smd_prot;
 712
 713         /*
 714          * Handle all pages returned in the pl[] array.
 715          * This loop is coded on the assumption that if
 716          * there was no error from the VOP_GETPAGE routine,
 717          * that the page list returned will contain all the
 718          * needed pages for the vp from [off..off + len].
 719          */
 720         ppp = pl;
 721         while ((pp = *ppp++) != NULL) {
 722                 u_offset_t poff;
 723                 ASSERT(pp->p_vnode == vp);
 724                 hat_flag = HAT_LOAD;
 725
 726                 /*
 727                  * Verify that the pages returned are within the range
 728                  * of this segmap region.  Note that it is theoretically
 729                  * possible for pages outside this range to be returned,
 730                  * but it is not very likely.  If we cannot use the
 731                  * page here, just release it and go on to the next one.
 732                  */
 733                 if (pp->p_offset < sm_off ||
 734                     pp->p_offset >= sm_off + MAXBSIZE) {
 735                         (void) page_release(pp, 1);
 736                         continue;
 737                 }
 738
 739                 ASSERT(hat == kas.a_hat);
 740                 poff = pp->p_offset;
 741                 adr = addr + (poff - off);
 742                 if (adr >= addr && adr < addr + len) {
 743                         hat_setref(pp);
 744                         TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
 745                             "segmap_fault:pp %p vp %p offset %llx",
 746                             pp, vp, poff);
 747                         if (type == F_SOFTLOCK)
 748                                 hat_flag = HAT_LOAD_LOCK;
 749                 }
 750
 751                 /*
 752                  * Deal with VMODSORT pages here. If we know this is a write
 753                  * do the setmod now and allow write protection.
 754                  * As long as it's modified or not S_OTHER, remove write
 755                  * protection. With S_OTHER it's up to the FS to deal with this.
 756                  */
 757                 if (IS_VMODSORT(vp)) {
 758                         if (rw == S_WRITE)
 759                                 hat_setmod(pp);
 760                         else if (rw != S_OTHER && !hat_ismod(pp))
 761                                 prot &= ~PROT_WRITE;
 762                 }
 763
 764                 hat_memload(hat, adr, pp, prot, hat_flag);
 765                 if (hat_flag != HAT_LOAD_LOCK)
 766                         page_unlock(pp);
 767         }
 768         return (0);
 769 }
 770
 771 /*
 772  * This routine is used to start I/O on pages asynchronously.
 773  */
 774 static faultcode_t
 775 segmap_faulta(struct seg *seg, caddr_t addr)
 776 {
 777         struct smap *smp;
 778         struct vnode *vp;
 779         u_offset_t off;
 780         int err;
 781
 782         if (segmap_kpm && IS_KPM_ADDR(addr)) {
 783                 int     newpage;
 784                 kmutex_t *smtx;
 785
 786                 /*
 787                  * Pages are successfully prefaulted and locked in
 788                  * segmap_getmapflt and can't be unlocked until
 789                  * segmap_release. No hat mappings have to be locked
 790                  * and they also can't be unlocked as long as the
 791                  * caller owns an active kpm addr.
 792                  */
 793 #ifdef  DEBUG
 794                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
 795                         panic("segmap_faulta: smap not found "
 796                             "for addr %p", (void *)addr);
 797                         /*NOTREACHED*/
 798                 }
 799
 800                 smtx = SMAPMTX(smp);
 801                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
 802                 mutex_exit(smtx);
 803                 if (newpage)
 804                         cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
 805                             (void *)smp);
 806 #endif
 807                 return (0);
 808         }
 809
 810         segmapcnt.smp_faulta.value.ul++;
 811         smp = GET_SMAP(seg, addr);
 812
 813         ASSERT(smp->sm_refcnt > 0);
 814
 815         vp = smp->sm_vp;
 816         off = smp->sm_off;
 817
 818         if (vp == NULL) {
 819                 cmn_err(CE_WARN, "segmap_faulta - no vp");
 820                 return (FC_MAKE_ERR(EIO));
 821         }
 822
 823         TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
 824             "segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
 825
 826         err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
 827             & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
 828             seg, addr, S_READ, CRED(), NULL);
 829
 830         if (err)
 831                 return (FC_MAKE_ERR(err));
 832         return (0);
 833 }
 834
 835 /*ARGSUSED*/
 836 static int
 837 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
 838 {
 839         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 840
 841         ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
 842
 843         /*
 844          * Need not acquire the segment lock since
 845          * "smd_prot" is a read-only field.
 846          */
 847         return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
 848 }
 849
 850 static int
 851 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 852 {
 853         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 854         size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
 855
 856         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
 857
 858         if (pgno != 0) {
 859                 do {
 860                         protv[--pgno] = smd->smd_prot;
 861                 } while (pgno != 0);
 862         }
 863         return (0);
 864 }
 865
 866 static u_offset_t
 867 segmap_getoffset(struct seg *seg, caddr_t addr)
 868 {
 869         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 870
 871         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 872
 873         return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
 874 }
 875
 876 /*ARGSUSED*/
 877 static int
 878 segmap_gettype(struct seg *seg, caddr_t addr)
 879 {
 880         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 881
 882         return (MAP_SHARED);
 883 }
 884
 885 /*ARGSUSED*/
 886 static int
 887 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 888 {
 889         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
 890
 891         ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
 892
 893         /* XXX - This doesn't make any sense */
 894         *vpp = smd->smd_sm->sm_vp;
 895         return (0);
 896 }
 897
 898 /*
 899  * Check to see if it makes sense to do kluster/read ahead to
 900  * addr + delta relative to the mapping at addr.  We assume here
 901  * that delta is a signed PAGESIZE'd multiple (which can be negative).
 902  *
 903  * For segmap we always "approve" of this action from our standpoint.
 904  */
 905 /*ARGSUSED*/
 906 static int
 907 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 908 {
 909         return (0);
 910 }
 911
 912 static void
 913 segmap_badop()
 914 {
 915         panic("segmap_badop");
 916         /*NOTREACHED*/
 917 }
 918
 919 /*
 920  * Special private segmap operations
 921  */
 922
 923 /*
 924  * Add smap to the appropriate free list.
 925  */
 926 static void
 927 segmap_smapadd(struct smap *smp)
 928 {
 929         struct smfree *sm;
 930         struct smap *smpfreelist;
 931         struct sm_freeq *releq;
 932
 933         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 934
 935         if (smp->sm_refcnt != 0) {
 936                 panic("segmap_smapadd");
 937                 /*NOTREACHED*/
 938         }
 939
 940         sm = &smd_free[smp->sm_free_ndx];
 941         /*
 942          * Add to the tail of the release queue
 943          * Note that sm_releq and sm_allocq could toggle
 944          * before we get the lock. This does not affect
 945          * correctness as the 2 queues are only maintained
 946          * to reduce lock pressure.
 947          */
 948         releq = sm->sm_releq;
 949         if (releq == &sm->sm_freeq[0])
 950                 smp->sm_flags |= SM_QNDX_ZERO;
 951         else
 952                 smp->sm_flags &= ~SM_QNDX_ZERO;
 953         mutex_enter(&releq->smq_mtx);
 954         smpfreelist = releq->smq_free;
 955         if (smpfreelist == 0) {
 956                 int want;
 957
 958                 releq->smq_free = smp->sm_next = smp->sm_prev = smp;
 959                 /*
 960                  * Both queue mutexes held to set sm_want;
 961                  * snapshot the value before dropping releq mutex.
 962                  * If sm_want appears after the releq mutex is dropped,
 963                  * then the smap just freed is already gone.
 964                  */
 965                 want = sm->sm_want;
 966                 mutex_exit(&releq->smq_mtx);
 967                 /*
 968                  * See if there was a waiter before dropping the releq mutex
 969                  * then recheck after obtaining sm_freeq[0] mutex as
 970                  * the another thread may have already signaled.
 971                  */
 972                 if (want) {
 973                         mutex_enter(&sm->sm_freeq[0].smq_mtx);
 974                         if (sm->sm_want)
 975                                 cv_signal(&sm->sm_free_cv);
 976                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
 977                 }
 978         } else {
 979                 smp->sm_next = smpfreelist;
 980                 smp->sm_prev = smpfreelist->sm_prev;
 981                 smpfreelist->sm_prev = smp;
 982                 smp->sm_prev->sm_next = smp;
 983                 mutex_exit(&releq->smq_mtx);
 984         }
 985 }
 986
 987
 988 static struct smap *
 989 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
 990 {
 991         struct smap **hpp;
 992         struct smap *tmp;
 993         kmutex_t *hmtx;
 994
 995         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
 996         ASSERT(smp->sm_vp == NULL);
 997         ASSERT(smp->sm_hash == NULL);
 998         ASSERT(smp->sm_prev == NULL);
 999         ASSERT(smp->sm_next == NULL);
1000         ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
1001
1002         hmtx = SHASHMTX(hashid);
1003
1004         mutex_enter(hmtx);
1005         /*
1006          * First we need to verify that no one has created a smp
1007          * with (vp,off) as its tag before we us.
1008          */
1009         for (tmp = smd_hash[hashid].sh_hash_list;
1010             tmp != NULL; tmp = tmp->sm_hash)
1011                 if (tmp->sm_vp == vp && tmp->sm_off == off)
1012                         break;
1013
1014         if (tmp == NULL) {
1015                 /*
1016                  * No one created one yet.
1017                  *
1018                  * Funniness here - we don't increment the ref count on the
1019                  * vnode * even though we have another pointer to it here.
1020                  * The reason for this is that we don't want the fact that
1021                  * a seg_map entry somewhere refers to a vnode to prevent the
1022                  * vnode * itself from going away.  This is because this
1023                  * reference to the vnode is a "soft one".  In the case where
1024                  * a mapping is being used by a rdwr [or directory routine?]
1025                  * there already has to be a non-zero ref count on the vnode.
1026                  * In the case where the vp has been freed and the the smap
1027                  * structure is on the free list, there are no pages in memory
1028                  * that can refer to the vnode.  Thus even if we reuse the same
1029                  * vnode/smap structure for a vnode which has the same
1030                  * address but represents a different object, we are ok.
1031                  */
1032                 smp->sm_vp = vp;
1033                 smp->sm_off = off;
1034
1035                 hpp = &smd_hash[hashid].sh_hash_list;
1036                 smp->sm_hash = *hpp;
1037                 *hpp = smp;
1038 #ifdef SEGMAP_HASHSTATS
1039                 smd_hash_len[hashid]++;
1040 #endif
1041         }
1042         mutex_exit(hmtx);
1043
1044         return (tmp);
1045 }
1046
1047 static void
1048 segmap_hashout(struct smap *smp)
1049 {
1050         struct smap **hpp, *hp;
1051         struct vnode *vp;
1052         kmutex_t *mtx;
1053         int hashid;
1054         u_offset_t off;
1055
1056         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1057
1058         vp = smp->sm_vp;
1059         off = smp->sm_off;
1060
1061         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1062         mtx = SHASHMTX(hashid);
1063         mutex_enter(mtx);
1064
1065         hpp = &smd_hash[hashid].sh_hash_list;
1066         for (;;) {
1067                 hp = *hpp;
1068                 if (hp == NULL) {
1069                         panic("segmap_hashout");
1070                         /*NOTREACHED*/
1071                 }
1072                 if (hp == smp)
1073                         break;
1074                 hpp = &hp->sm_hash;
1075         }
1076
1077         *hpp = smp->sm_hash;
1078         smp->sm_hash = NULL;
1079 #ifdef SEGMAP_HASHSTATS
1080         smd_hash_len[hashid]--;
1081 #endif
1082         mutex_exit(mtx);
1083
1084         smp->sm_vp = NULL;
1085         smp->sm_off = (u_offset_t)0;
1086
1087 }
1088
1089 /*
1090  * Attempt to free unmodified, unmapped, and non locked segmap
1091  * pages.
1092  */
1093 void
1094 segmap_pagefree(struct vnode *vp, u_offset_t off)
1095 {
1096         u_offset_t pgoff;
1097         page_t  *pp;
1098
1099         for (pgoff = off; pgoff < off + MAXBSIZE; pgoff += PAGESIZE) {
1100
1101                 if ((pp = page_lookup_nowait(vp, pgoff, SE_EXCL)) == NULL)
1102                         continue;
1103
1104                 switch (page_release(pp, 1)) {
1105                 case PGREL_NOTREL:
1106                         segmapcnt.smp_free_notfree.value.ul++;
1107                         break;
1108                 case PGREL_MOD:
1109                         segmapcnt.smp_free_dirty.value.ul++;
1110                         break;
1111                 case PGREL_CLEAN:
1112                         segmapcnt.smp_free.value.ul++;
1113                         break;
1114                 }
1115         }
1116 }
1117
1118 /*
1119  * Locks held on entry: smap lock
1120  * Locks held on exit : smap lock.
1121  */
1122
1123 static void
1124 grab_smp(struct smap *smp, page_t *pp)
1125 {
1126         ASSERT(MUTEX_HELD(SMAPMTX(smp)));
1127         ASSERT(smp->sm_refcnt == 0);
1128
1129         if (smp->sm_vp != (struct vnode *)NULL) {
1130                 struct vnode    *vp = smp->sm_vp;
1131                 u_offset_t      off = smp->sm_off;
1132                 /*
1133                  * Destroy old vnode association and
1134                  * unload any hardware translations to
1135                  * the old object.
1136                  */
1137                 smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reuse++;
1138                 segmap_hashout(smp);
1139
1140                 /*
1141                  * This node is off freelist and hashlist,
1142                  * so there is no reason to drop/reacquire sm_mtx
1143                  * across calls to hat_unload.
1144                  */
1145                 if (segmap_kpm) {
1146                         caddr_t vaddr;
1147                         int hat_unload_needed = 0;
1148
1149                         /*
1150                          * unload kpm mapping
1151                          */
1152                         if (pp != NULL) {
1153                                 vaddr = hat_kpm_page2va(pp, 1);
1154                                 hat_kpm_mapout(pp, GET_KPME(smp), vaddr);
1155                                 page_unlock(pp);
1156                         }
1157
1158                         /*
1159                          * Check if we have (also) the rare case of a
1160                          * non kpm mapping.
1161                          */
1162                         if (smp->sm_flags & SM_NOTKPM_RELEASED) {
1163                                 hat_unload_needed = 1;
1164                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1165                         }
1166
1167                         if (hat_unload_needed) {
1168                                 hat_unload(kas.a_hat, segkmap->s_base +
1169                                     ((smp - smd_smap) * MAXBSIZE),
1170                                     MAXBSIZE, HAT_UNLOAD);
1171                         }
1172
1173                 } else {
1174                         ASSERT(smp->sm_flags & SM_NOTKPM_RELEASED);
1175                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
1176                         hat_unload(kas.a_hat, segkmap->s_base +
1177                             ((smp - smd_smap) * MAXBSIZE),
1178                             MAXBSIZE, HAT_UNLOAD);
1179                 }
1180                 segmap_pagefree(vp, off);
1181         }
1182 }
1183
1184 static struct smap *
1185 get_free_smp(int free_ndx)
1186 {
1187         struct smfree *sm;
1188         kmutex_t *smtx;
1189         struct smap *smp, *first;
1190         struct sm_freeq *allocq, *releq;
1191         struct kpme *kpme;
1192         page_t *pp = NULL;
1193         int end_ndx, page_locked = 0;
1194
1195         end_ndx = free_ndx;
1196         sm = &smd_free[free_ndx];
1197
1198 retry_queue:
1199         allocq = sm->sm_allocq;
1200         mutex_enter(&allocq->smq_mtx);
1201
1202         if ((smp = allocq->smq_free) == NULL) {
1203
1204 skip_queue:
1205                 /*
1206                  * The alloc list is empty or this queue is being skipped;
1207                  * first see if the allocq toggled.
1208                  */
1209                 if (sm->sm_allocq != allocq) {
1210                         /* queue changed */
1211                         mutex_exit(&allocq->smq_mtx);
1212                         goto retry_queue;
1213                 }
1214                 releq = sm->sm_releq;
1215                 if (!mutex_tryenter(&releq->smq_mtx)) {
1216                         /* cannot get releq; a free smp may be there now */
1217                         mutex_exit(&allocq->smq_mtx);
1218
1219                         /*
1220                          * This loop could spin forever if this thread has
1221                          * higher priority than the thread that is holding
1222                          * releq->smq_mtx. In order to force the other thread
1223                          * to run, we'll lock/unlock the mutex which is safe
1224                          * since we just unlocked the allocq mutex.
1225                          */
1226                         mutex_enter(&releq->smq_mtx);
1227                         mutex_exit(&releq->smq_mtx);
1228                         goto retry_queue;
1229                 }
1230                 if (releq->smq_free == NULL) {
1231                         /*
1232                          * This freelist is empty.
1233                          * This should not happen unless clients
1234                          * are failing to release the segmap
1235                          * window after accessing the data.
1236                          * Before resorting to sleeping, try
1237                          * the next list of the same color.
1238                          */
1239                         free_ndx = (free_ndx + smd_ncolor) & smd_freemsk;
1240                         if (free_ndx != end_ndx) {
1241                                 mutex_exit(&releq->smq_mtx);
1242                                 mutex_exit(&allocq->smq_mtx);
1243                                 sm = &smd_free[free_ndx];
1244                                 goto retry_queue;
1245                         }
1246                         /*
1247                          * Tried all freelists of the same color once,
1248                          * wait on this list and hope something gets freed.
1249                          */
1250                         segmapcnt.smp_get_nofree.value.ul++;
1251                         sm->sm_want++;
1252                         mutex_exit(&sm->sm_freeq[1].smq_mtx);
1253                         cv_wait(&sm->sm_free_cv,
1254                             &sm->sm_freeq[0].smq_mtx);
1255                         sm->sm_want--;
1256                         mutex_exit(&sm->sm_freeq[0].smq_mtx);
1257                         sm = &smd_free[free_ndx];
1258                         goto retry_queue;
1259                 } else {
1260                         /*
1261                          * Something on the rele queue; flip the alloc
1262                          * and rele queues and retry.
1263                          */
1264                         sm->sm_allocq = releq;
1265                         sm->sm_releq = allocq;
1266                         mutex_exit(&allocq->smq_mtx);
1267                         mutex_exit(&releq->smq_mtx);
1268                         if (page_locked) {
1269                                 delay(hz >> 2);
1270                                 page_locked = 0;
1271                         }
1272                         goto retry_queue;
1273                 }
1274         } else {
1275                 /*
1276                  * Fastpath the case we get the smap mutex
1277                  * on the first try.
1278                  */
1279                 first = smp;
1280 next_smap:
1281                 smtx = SMAPMTX(smp);
1282                 if (!mutex_tryenter(smtx)) {
1283                         /*
1284                          * Another thread is trying to reclaim this slot.
1285                          * Skip to the next queue or smap.
1286                          */
1287                         if ((smp = smp->sm_next) == first) {
1288                                 goto skip_queue;
1289                         } else {
1290                                 goto next_smap;
1291                         }
1292                 } else {
1293                         /*
1294                          * if kpme exists, get shared lock on the page
1295                          */
1296                         if (segmap_kpm && smp->sm_vp != NULL) {
1297
1298                                 kpme = GET_KPME(smp);
1299                                 pp = kpme->kpe_page;
1300
1301                                 if (pp != NULL) {
1302                                         if (!page_trylock(pp, SE_SHARED)) {
1303                                                 smp = smp->sm_next;
1304                                                 mutex_exit(smtx);
1305                                                 page_locked = 1;
1306
1307                                                 pp = NULL;
1308
1309                                                 if (smp == first) {
1310                                                         goto skip_queue;
1311                                                 } else {
1312                                                         goto next_smap;
1313                                                 }
1314                                         } else {
1315                                                 if (kpme->kpe_page == NULL) {
1316                                                         page_unlock(pp);
1317                                                         pp = NULL;
1318                                                 }
1319                                         }
1320                                 }
1321                         }
1322
1323                         /*
1324                          * At this point, we've selected smp.  Remove smp
1325                          * from its freelist.  If smp is the first one in
1326                          * the freelist, update the head of the freelist.
1327                          */
1328                         if (first == smp) {
1329                                 ASSERT(first == allocq->smq_free);
1330                                 allocq->smq_free = smp->sm_next;
1331                         }
1332
1333                         /*
1334                          * if the head of the freelist still points to smp,
1335                          * then there are no more free smaps in that list.
1336                          */
1337                         if (allocq->smq_free == smp)
1338                                 /*
1339                                  * Took the last one
1340                                  */
1341                                 allocq->smq_free = NULL;
1342                         else {
1343                                 smp->sm_prev->sm_next = smp->sm_next;
1344                                 smp->sm_next->sm_prev = smp->sm_prev;
1345                         }
1346                         mutex_exit(&allocq->smq_mtx);
1347                         smp->sm_prev = smp->sm_next = NULL;
1348
1349                         /*
1350                          * if pp != NULL, pp must have been locked;
1351                          * grab_smp() unlocks pp.
1352                          */
1353                         ASSERT((pp == NULL) || PAGE_LOCKED(pp));
1354                         grab_smp(smp, pp);
1355                         /* return smp locked. */
1356                         ASSERT(SMAPMTX(smp) == smtx);
1357                         ASSERT(MUTEX_HELD(smtx));
1358                         return (smp);
1359                 }
1360         }
1361 }
1362
1363 /*
1364  * Special public segmap operations
1365  */
1366
1367 /*
1368  * Create pages (without using VOP_GETPAGE) and load up translations to them.
1369  * If softlock is TRUE, then set things up so that it looks like a call
1370  * to segmap_fault with F_SOFTLOCK.
1371  *
1372  * Returns 1, if a page is created by calling page_create_va(), or 0 otherwise.
1373  *
1374  * All fields in the generic segment (struct seg) are considered to be
1375  * read-only for "segmap" even though the kernel address space (kas) may
1376  * not be locked, hence no lock is needed to access them.
1377  */
1378 int
1379 segmap_pagecreate(struct seg *seg, caddr_t addr, size_t len, int softlock)
1380 {
1381         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
1382         page_t *pp;
1383         u_offset_t off;
1384         struct smap *smp;
1385         struct vnode *vp;
1386         caddr_t eaddr;
1387         int newpage = 0;
1388         uint_t prot;
1389         kmutex_t *smtx;
1390         int hat_flag;
1391
1392         ASSERT(seg->s_as == &kas);
1393
1394         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1395                 /*
1396                  * Pages are successfully prefaulted and locked in
1397                  * segmap_getmapflt and can't be unlocked until
1398                  * segmap_release. The SM_KPM_NEWPAGE flag is set
1399                  * in segmap_pagecreate_kpm when new pages are created.
1400                  * and it is returned as "newpage" indication here.
1401                  */
1402                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1403                         panic("segmap_pagecreate: smap not found "
1404                             "for addr %p", (void *)addr);
1405                         /*NOTREACHED*/
1406                 }
1407
1408                 smtx = SMAPMTX(smp);
1409                 newpage = smp->sm_flags & SM_KPM_NEWPAGE;
1410                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
1411                 mutex_exit(smtx);
1412
1413                 return (newpage);
1414         }
1415
1416         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
1417
1418         eaddr = addr + len;
1419         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1420
1421         smp = GET_SMAP(seg, addr);
1422
1423         /*
1424          * We don't grab smp mutex here since we assume the smp
1425          * has a refcnt set already which prevents the slot from
1426          * changing its id.
1427          */
1428         ASSERT(smp->sm_refcnt > 0);
1429
1430         vp = smp->sm_vp;
1431         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1432         prot = smd->smd_prot;
1433
1434         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1435                 hat_flag = HAT_LOAD;
1436                 pp = page_lookup(vp, off, SE_SHARED);
1437                 if (pp == NULL) {
1438                         ushort_t bitindex;
1439
1440                         if ((pp = page_create_va(vp, off,
1441                             PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
1442                                 panic("segmap_pagecreate: page_create failed");
1443                                 /*NOTREACHED*/
1444                         }
1445                         newpage = 1;
1446                         page_io_unlock(pp);
1447
1448                         /*
1449                          * Since pages created here do not contain valid
1450                          * data until the caller writes into them, the
1451                          * "exclusive" lock will not be dropped to prevent
1452                          * other users from accessing the page.  We also
1453                          * have to lock the translation to prevent a fault
1454                          * from occurring when the virtual address mapped by
1455                          * this page is written into.  This is necessary to
1456                          * avoid a deadlock since we haven't dropped the
1457                          * "exclusive" lock.
1458                          */
1459                         bitindex = (ushort_t)((off - smp->sm_off) >> PAGESHIFT);
1460
1461                         /*
1462                          * Large Files: The following assertion is to
1463                          * verify the cast above.
1464                          */
1465                         ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1466                         smtx = SMAPMTX(smp);
1467                         mutex_enter(smtx);
1468                         smp->sm_bitmap |= SMAP_BIT_MASK(bitindex);
1469                         mutex_exit(smtx);
1470
1471                         hat_flag = HAT_LOAD_LOCK;
1472                 } else if (softlock) {
1473                         hat_flag = HAT_LOAD_LOCK;
1474                 }
1475
1476                 if (IS_VMODSORT(pp->p_vnode) && (prot & PROT_WRITE))
1477                         hat_setmod(pp);
1478
1479                 hat_memload(kas.a_hat, addr, pp, prot, hat_flag);
1480
1481                 if (hat_flag != HAT_LOAD_LOCK)
1482                         page_unlock(pp);
1483
1484                 TRACE_5(TR_FAC_VM, TR_SEGMAP_PAGECREATE,
1485                     "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1486                     seg, addr, pp, vp, off);
1487         }
1488
1489         return (newpage);
1490 }
1491
1492 void
1493 segmap_pageunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
1494 {
1495         struct smap     *smp;
1496         ushort_t        bitmask;
1497         page_t          *pp;
1498         struct  vnode   *vp;
1499         u_offset_t      off;
1500         caddr_t         eaddr;
1501         kmutex_t        *smtx;
1502
1503         ASSERT(seg->s_as == &kas);
1504
1505         eaddr = addr + len;
1506         addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1507
1508         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1509                 /*
1510                  * Pages are successfully prefaulted and locked in
1511                  * segmap_getmapflt and can't be unlocked until
1512                  * segmap_release, so no pages or hat mappings have
1513                  * to be unlocked at this point.
1514                  */
1515 #ifdef DEBUG
1516                 if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
1517                         panic("segmap_pageunlock: smap not found "
1518                             "for addr %p", (void *)addr);
1519                         /*NOTREACHED*/
1520                 }
1521
1522                 ASSERT(smp->sm_refcnt > 0);
1523                 mutex_exit(SMAPMTX(smp));
1524 #endif
1525                 return;
1526         }
1527
1528         smp = GET_SMAP(seg, addr);
1529         smtx = SMAPMTX(smp);
1530
1531         ASSERT(smp->sm_refcnt > 0);
1532
1533         vp = smp->sm_vp;
1534         off = smp->sm_off + ((u_offset_t)((uintptr_t)addr & MAXBOFFSET));
1535
1536         for (; addr < eaddr; addr += PAGESIZE, off += PAGESIZE) {
1537                 bitmask = SMAP_BIT_MASK((int)(off - smp->sm_off) >> PAGESHIFT);
1538
1539                 /*
1540                  * Large Files: Following assertion is to verify
1541                  * the correctness of the cast to (int) above.
1542                  */
1543                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
1544
1545                 /*
1546                  * If the bit corresponding to "off" is set,
1547                  * clear this bit in the bitmap, unlock translations,
1548                  * and release the "exclusive" lock on the page.
1549                  */
1550                 if (smp->sm_bitmap & bitmask) {
1551                         mutex_enter(smtx);
1552                         smp->sm_bitmap &= ~bitmask;
1553                         mutex_exit(smtx);
1554
1555                         hat_unlock(kas.a_hat, addr, PAGESIZE);
1556
1557                         /*
1558                          * Use page_find() instead of page_lookup() to
1559                          * find the page since we know that it has
1560                          * "exclusive" lock.
1561                          */
1562                         pp = page_find(vp, off);
1563                         if (pp == NULL) {
1564                                 panic("segmap_pageunlock: page not found");
1565                                 /*NOTREACHED*/
1566                         }
1567                         if (rw == S_WRITE) {
1568                                 hat_setrefmod(pp);
1569                         } else if (rw != S_OTHER) {
1570                                 hat_setref(pp);
1571                         }
1572
1573                         page_unlock(pp);
1574                 }
1575         }
1576 }
1577
1578 caddr_t
1579 segmap_getmap(struct seg *seg, struct vnode *vp, u_offset_t off)
1580 {
1581         return (segmap_getmapflt(seg, vp, off, MAXBSIZE, 0, S_OTHER));
1582 }
1583
1584 /*
1585  * This is the magic virtual address that offset 0 of an ELF
1586  * file gets mapped to in user space. This is used to pick
1587  * the vac color on the freelist.
1588  */
1589 #define ELF_OFFZERO_VA  (0x10000)
1590 /*
1591  * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
1592  * in the range <off, off + len). off doesn't need to be MAXBSIZE aligned.
1593  * The return address is  always MAXBSIZE aligned.
1594  *
1595  * If forcefault is nonzero and the MMU translations haven't yet been created,
1596  * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
1597  */
1598 caddr_t
1599 segmap_getmapflt(
1600         struct seg *seg,
1601         struct vnode *vp,
1602         u_offset_t off,
1603         size_t len,
1604         int forcefault,
1605         enum seg_rw rw)
1606 {
1607         struct smap *smp, *nsmp;
1608         extern struct vnode *common_specvp();
1609         caddr_t baseaddr;                       /* MAXBSIZE aligned */
1610         u_offset_t baseoff;
1611         int newslot;
1612         caddr_t vaddr;
1613         int color, hashid;
1614         kmutex_t *hashmtx, *smapmtx;
1615         struct smfree *sm;
1616         page_t  *pp;
1617         struct kpme *kpme;
1618         uint_t  prot;
1619         caddr_t base;
1620         page_t  *pl[MAXPPB + 1];
1621         int     error;
1622         int     is_kpm = 1;
1623
1624         ASSERT(seg->s_as == &kas);
1625         ASSERT(seg == segkmap);
1626
1627         baseoff = off & (offset_t)MAXBMASK;
1628         if (off + len > baseoff + MAXBSIZE) {
1629                 panic("segmap_getmap bad len");
1630                 /*NOTREACHED*/
1631         }
1632
1633         /*
1634          * If this is a block device we have to be sure to use the
1635          * "common" block device vnode for the mapping.
1636          */
1637         if (vp->v_type == VBLK)
1638                 vp = common_specvp(vp);
1639
1640         smd_cpu[CPU->cpu_seqid].scpu.scpu_getmap++;
1641
1642         if (segmap_kpm == 0 ||
1643             (forcefault == SM_PAGECREATE && rw != S_WRITE)) {
1644                 is_kpm = 0;
1645         }
1646
1647         SMAP_HASHFUNC(vp, off, hashid); /* macro assigns hashid */
1648         hashmtx = SHASHMTX(hashid);
1649
1650 retry_hash:
1651         mutex_enter(hashmtx);
1652         for (smp = smd_hash[hashid].sh_hash_list;
1653             smp != NULL; smp = smp->sm_hash)
1654                 if (smp->sm_vp == vp && smp->sm_off == baseoff)
1655                         break;
1656         mutex_exit(hashmtx);
1657
1658 vrfy_smp:
1659         if (smp != NULL) {
1660
1661                 ASSERT(vp->v_count != 0);
1662
1663                 /*
1664                  * Get smap lock and recheck its tag. The hash lock
1665                  * is dropped since the hash is based on (vp, off)
1666                  * and (vp, off) won't change when we have smap mtx.
1667                  */
1668                 smapmtx = SMAPMTX(smp);
1669                 mutex_enter(smapmtx);
1670                 if (smp->sm_vp != vp || smp->sm_off != baseoff) {
1671                         mutex_exit(smapmtx);
1672                         goto retry_hash;
1673                 }
1674
1675                 if (smp->sm_refcnt == 0) {
1676
1677                         smd_cpu[CPU->cpu_seqid].scpu.scpu_get_reclaim++;
1678
1679                         /*
1680                          * Could still be on the free list. However, this
1681                          * could also be an smp that is transitioning from
1682                          * the free list when we have too much contention
1683                          * for the smapmtx's. In this case, we have an
1684                          * unlocked smp that is not on the free list any
1685                          * longer, but still has a 0 refcnt.  The only way
1686                          * to be sure is to check the freelist pointers.
1687                          * Since we now have the smapmtx, we are guaranteed
1688                          * that the (vp, off) won't change, so we are safe
1689                          * to reclaim it.  get_free_smp() knows that this
1690                          * can happen, and it will check the refcnt.
1691                          */
1692
1693                         if ((smp->sm_next != NULL)) {
1694                                 struct sm_freeq *freeq;
1695
1696                                 ASSERT(smp->sm_prev != NULL);
1697                                 sm = &smd_free[smp->sm_free_ndx];
1698
1699                                 if (smp->sm_flags & SM_QNDX_ZERO)
1700                                         freeq = &sm->sm_freeq[0];
1701                                 else
1702                                         freeq = &sm->sm_freeq[1];
1703
1704                                 mutex_enter(&freeq->smq_mtx);
1705                                 if (freeq->smq_free != smp) {
1706                                         /*
1707                                          * fastpath normal case
1708                                          */
1709                                         smp->sm_prev->sm_next = smp->sm_next;
1710                                         smp->sm_next->sm_prev = smp->sm_prev;
1711                                 } else if (smp == smp->sm_next) {
1712                                         /*
1713                                          * Taking the last smap on freelist
1714                                          */
1715                                         freeq->smq_free = NULL;
1716                                 } else {
1717                                         /*
1718                                          * Reclaiming 1st smap on list
1719                                          */
1720                                         freeq->smq_free = smp->sm_next;
1721                                         smp->sm_prev->sm_next = smp->sm_next;
1722                                         smp->sm_next->sm_prev = smp->sm_prev;
1723                                 }
1724                                 mutex_exit(&freeq->smq_mtx);
1725                                 smp->sm_prev = smp->sm_next = NULL;
1726                         } else {
1727                                 ASSERT(smp->sm_prev == NULL);
1728                                 segmapcnt.smp_stolen.value.ul++;
1729                         }
1730
1731                 } else {
1732                         segmapcnt.smp_get_use.value.ul++;
1733                 }
1734                 smp->sm_refcnt++;               /* another user */
1735
1736                 /*
1737                  * We don't invoke segmap_fault via TLB miss, so we set ref
1738                  * and mod bits in advance. For S_OTHER  we set them in
1739                  * segmap_fault F_SOFTUNLOCK.
1740                  */
1741                 if (is_kpm) {
1742                         if (rw == S_WRITE) {
1743                                 smp->sm_flags |= SM_WRITE_DATA;
1744                         } else if (rw == S_READ) {
1745                                 smp->sm_flags |= SM_READ_DATA;
1746                         }
1747                 }
1748                 mutex_exit(smapmtx);
1749
1750                 newslot = 0;
1751         } else {
1752
1753                 uint32_t free_ndx, *free_ndxp;
1754                 union segmap_cpu *scpu;
1755
1756                 /*
1757                  * On a PAC machine or a machine with anti-alias
1758                  * hardware, smd_colormsk will be zero.
1759                  *
1760                  * On a VAC machine- pick color by offset in the file
1761                  * so we won't get VAC conflicts on elf files.
1762                  * On data files, color does not matter but we
1763                  * don't know what kind of file it is so we always
1764                  * pick color by offset. This causes color
1765                  * corresponding to file offset zero to be used more
1766                  * heavily.
1767                  */
1768                 color = (baseoff >> MAXBSHIFT) & smd_colormsk;
1769                 scpu = smd_cpu+CPU->cpu_seqid;
1770                 free_ndxp = &scpu->scpu.scpu_free_ndx[color];
1771                 free_ndx = (*free_ndxp += smd_ncolor) & smd_freemsk;
1772 #ifdef DEBUG
1773                 colors_used[free_ndx]++;
1774 #endif /* DEBUG */
1775
1776                 /*
1777                  * Get a locked smp slot from the free list.
1778                  */
1779                 smp = get_free_smp(free_ndx);
1780                 smapmtx = SMAPMTX(smp);
1781
1782                 ASSERT(smp->sm_vp == NULL);
1783
1784                 if ((nsmp = segmap_hashin(smp, vp, baseoff, hashid)) != NULL) {
1785                         /*
1786                          * Failed to hashin, there exists one now.
1787                          * Return the smp we just allocated.
1788                          */
1789                         segmap_smapadd(smp);
1790                         mutex_exit(smapmtx);
1791
1792                         smp = nsmp;
1793                         goto vrfy_smp;
1794                 }
1795                 smp->sm_refcnt++;               /* another user */
1796
1797                 /*
1798                  * We don't invoke segmap_fault via TLB miss, so we set ref
1799                  * and mod bits in advance. For S_OTHER  we set them in
1800                  * segmap_fault F_SOFTUNLOCK.
1801                  */
1802                 if (is_kpm) {
1803                         if (rw == S_WRITE) {
1804                                 smp->sm_flags |= SM_WRITE_DATA;
1805                         } else if (rw == S_READ) {
1806                                 smp->sm_flags |= SM_READ_DATA;
1807                         }
1808                 }
1809                 mutex_exit(smapmtx);
1810
1811                 newslot = 1;
1812         }
1813
1814         if (!is_kpm)
1815                 goto use_segmap_range;
1816
1817         /*
1818          * Use segkpm
1819          */
1820         /* Lint directive required until 6746211 is fixed */
1821         /*CONSTCOND*/
1822         ASSERT(PAGESIZE == MAXBSIZE);
1823
1824         /*
1825          * remember the last smp faulted on this cpu.
1826          */
1827         (smd_cpu+CPU->cpu_seqid)->scpu.scpu_last_smap = smp;
1828
1829         if (forcefault == SM_PAGECREATE) {
1830                 baseaddr = segmap_pagecreate_kpm(seg, vp, baseoff, smp, rw);
1831                 return (baseaddr);
1832         }
1833
1834         if (newslot == 0 &&
1835             (pp = GET_KPME(smp)->kpe_page) != NULL) {
1836
1837                 /* fastpath */
1838                 switch (rw) {
1839                 case S_READ:
1840                 case S_WRITE:
1841                         if (page_trylock(pp, SE_SHARED)) {
1842                                 if (PP_ISFREE(pp) ||
1843                                     !(pp->p_vnode == vp &&
1844                                     pp->p_offset == baseoff)) {
1845                                         page_unlock(pp);
1846                                         pp = page_lookup(vp, baseoff,
1847                                             SE_SHARED);
1848                                 }
1849                         } else {
1850                                 pp = page_lookup(vp, baseoff, SE_SHARED);
1851                         }
1852
1853                         if (pp == NULL) {
1854                                 ASSERT(GET_KPME(smp)->kpe_page == NULL);
1855                                 break;
1856                         }
1857
1858                         if (rw == S_WRITE &&
1859                             hat_page_getattr(pp, P_MOD | P_REF) !=
1860                             (P_MOD | P_REF)) {
1861                                 page_unlock(pp);
1862                                 break;
1863                         }
1864
1865                         /*
1866                          * We have the p_selock as reader, grab_smp
1867                          * can't hit us, we have bumped the smap
1868                          * refcnt and hat_pageunload needs the
1869                          * p_selock exclusive.
1870                          */
1871                         kpme = GET_KPME(smp);
1872                         if (kpme->kpe_page == pp) {
1873                                 baseaddr = hat_kpm_page2va(pp, 0);
1874                         } else if (kpme->kpe_page == NULL) {
1875                                 baseaddr = hat_kpm_mapin(pp, kpme);
1876                         } else {
1877                                 panic("segmap_getmapflt: stale "
1878                                     "kpme page, kpme %p", (void *)kpme);
1879                                 /*NOTREACHED*/
1880                         }
1881
1882                         /*
1883                          * We don't invoke segmap_fault via TLB miss,
1884                          * so we set ref and mod bits in advance.
1885                          * For S_OTHER and we set them in segmap_fault
1886                          * F_SOFTUNLOCK.
1887                          */
1888                         if (rw == S_READ && !hat_isref(pp))
1889                                 hat_setref(pp);
1890
1891                         return (baseaddr);
1892                 default:
1893                         break;
1894                 }
1895         }
1896
1897         base = segkpm_create_va(baseoff);
1898         error = VOP_GETPAGE(vp, (offset_t)baseoff, len, &prot, pl, MAXBSIZE,
1899             seg, base, rw, CRED(), NULL);
1900
1901         pp = pl[0];
1902         if (error || pp == NULL) {
1903                 /*
1904                  * Use segmap address slot and let segmap_fault deal
1905                  * with the error cases. There is no error return
1906                  * possible here.
1907                  */
1908                 goto use_segmap_range;
1909         }
1910
1911         ASSERT(pl[1] == NULL);
1912
1913         /*
1914          * When prot is not returned w/ PROT_ALL the returned pages
1915          * are not backed by fs blocks. For most of the segmap users
1916          * this is no problem, they don't write to the pages in the
1917          * same request and therefore don't rely on a following
1918          * trap driven segmap_fault. With SM_LOCKPROTO users it
1919          * is more secure to use segkmap adresses to allow
1920          * protection segmap_fault's.
1921          */
1922         if (prot != PROT_ALL && forcefault == SM_LOCKPROTO) {
1923                 /*
1924                  * Use segmap address slot and let segmap_fault
1925                  * do the error return.
1926                  */
1927                 ASSERT(rw != S_WRITE);
1928                 ASSERT(PAGE_LOCKED(pp));
1929                 page_unlock(pp);
1930                 forcefault = 0;
1931                 goto use_segmap_range;
1932         }
1933
1934         /*
1935          * We have the p_selock as reader, grab_smp can't hit us, we
1936          * have bumped the smap refcnt and hat_pageunload needs the
1937          * p_selock exclusive.
1938          */
1939         kpme = GET_KPME(smp);
1940         if (kpme->kpe_page == pp) {
1941                 baseaddr = hat_kpm_page2va(pp, 0);
1942         } else if (kpme->kpe_page == NULL) {
1943                 baseaddr = hat_kpm_mapin(pp, kpme);
1944         } else {
1945                 panic("segmap_getmapflt: stale kpme page after "
1946                     "VOP_GETPAGE, kpme %p", (void *)kpme);
1947                 /*NOTREACHED*/
1948         }
1949
1950         smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
1951
1952         return (baseaddr);
1953
1954
1955 use_segmap_range:
1956         baseaddr = seg->s_base + ((smp - smd_smap) * MAXBSIZE);
1957         TRACE_4(TR_FAC_VM, TR_SEGMAP_GETMAP,
1958             "segmap_getmap:seg %p addr %p vp %p offset %llx",
1959             seg, baseaddr, vp, baseoff);
1960
1961         /*
1962          * Prefault the translations
1963          */
1964         vaddr = baseaddr + (off - baseoff);
1965         if (forcefault && (newslot || !hat_probe(kas.a_hat, vaddr))) {
1966
1967                 caddr_t pgaddr = (caddr_t)((uintptr_t)vaddr &
1968                     (uintptr_t)PAGEMASK);
1969
1970                 (void) segmap_fault(kas.a_hat, seg, pgaddr,
1971                     (vaddr + len - pgaddr + PAGESIZE - 1) & (uintptr_t)PAGEMASK,
1972                     F_INVAL, rw);
1973         }
1974
1975         return (baseaddr);
1976 }
1977
1978 int
1979 segmap_release(struct seg *seg, caddr_t addr, uint_t flags)
1980 {
1981         struct smap     *smp;
1982         int             error;
1983         int             bflags = 0;
1984         struct vnode    *vp;
1985         u_offset_t      offset;
1986         kmutex_t        *smtx;
1987         int             is_kpm = 0;
1988         page_t          *pp;
1989
1990         if (segmap_kpm && IS_KPM_ADDR(addr)) {
1991
1992                 if (((uintptr_t)addr & MAXBOFFSET) != 0) {
1993                         panic("segmap_release: addr %p not "
1994                             "MAXBSIZE aligned", (void *)addr);
1995                         /*NOTREACHED*/
1996                 }
1997
1998                 if ((smp = get_smap_kpm(addr, &pp)) == NULL) {
1999                         panic("segmap_release: smap not found "
2000                             "for addr %p", (void *)addr);
2001                         /*NOTREACHED*/
2002                 }
2003
2004                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2005                     "segmap_relmap:seg %p addr %p smp %p",
2006                     seg, addr, smp);
2007
2008                 smtx = SMAPMTX(smp);
2009
2010                 /*
2011                  * For compatibility reasons segmap_pagecreate_kpm sets this
2012                  * flag to allow a following segmap_pagecreate to return
2013                  * this as "newpage" flag. When segmap_pagecreate is not
2014                  * called at all we clear it now.
2015                  */
2016                 smp->sm_flags &= ~SM_KPM_NEWPAGE;
2017                 is_kpm = 1;
2018                 if (smp->sm_flags & SM_WRITE_DATA) {
2019                         hat_setrefmod(pp);
2020                 } else if (smp->sm_flags & SM_READ_DATA) {
2021                         hat_setref(pp);
2022                 }
2023         } else {
2024                 if (addr < seg->s_base || addr >= seg->s_base + seg->s_size ||
2025                     ((uintptr_t)addr & MAXBOFFSET) != 0) {
2026                         panic("segmap_release: bad addr %p", (void *)addr);
2027                         /*NOTREACHED*/
2028                 }
2029                 smp = GET_SMAP(seg, addr);
2030
2031                 TRACE_3(TR_FAC_VM, TR_SEGMAP_RELMAP,
2032                     "segmap_relmap:seg %p addr %p smp %p",
2033                     seg, addr, smp);
2034
2035                 smtx = SMAPMTX(smp);
2036                 mutex_enter(smtx);
2037                 smp->sm_flags |= SM_NOTKPM_RELEASED;
2038         }
2039
2040         ASSERT(smp->sm_refcnt > 0);
2041
2042         /*
2043          * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2044          * are set.
2045          */
2046         if ((flags & ~SM_DONTNEED) != 0) {
2047                 if (flags & SM_WRITE)
2048                         segmapcnt.smp_rel_write.value.ul++;
2049                 if (flags & SM_ASYNC) {
2050                         bflags |= B_ASYNC;
2051                         segmapcnt.smp_rel_async.value.ul++;
2052                 }
2053                 if (flags & SM_INVAL) {
2054                         bflags |= B_INVAL;
2055                         segmapcnt.smp_rel_abort.value.ul++;
2056                 }
2057                 if (flags & SM_DESTROY) {
2058                         bflags |= (B_INVAL|B_TRUNC);
2059                         segmapcnt.smp_rel_abort.value.ul++;
2060                 }
2061                 if (smp->sm_refcnt == 1) {
2062                         /*
2063                          * We only bother doing the FREE and DONTNEED flags
2064                          * if no one else is still referencing this mapping.
2065                          */
2066                         if (flags & SM_FREE) {
2067                                 bflags |= B_FREE;
2068                                 segmapcnt.smp_rel_free.value.ul++;
2069                         }
2070                         if (flags & SM_DONTNEED) {
2071                                 bflags |= B_DONTNEED;
2072                                 segmapcnt.smp_rel_dontneed.value.ul++;
2073                         }
2074                 }
2075         } else {
2076                 smd_cpu[CPU->cpu_seqid].scpu.scpu_release++;
2077         }
2078
2079         vp = smp->sm_vp;
2080         offset = smp->sm_off;
2081
2082         if (--smp->sm_refcnt == 0) {
2083
2084                 smp->sm_flags &= ~(SM_WRITE_DATA | SM_READ_DATA);
2085
2086                 if (flags & (SM_INVAL|SM_DESTROY)) {
2087                         segmap_hashout(smp);    /* remove map info */
2088                         if (is_kpm) {
2089                                 hat_kpm_mapout(pp, GET_KPME(smp), addr);
2090                                 if (smp->sm_flags & SM_NOTKPM_RELEASED) {
2091                                         smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2092                                         hat_unload(kas.a_hat, segkmap->s_base +
2093                                             ((smp - smd_smap) * MAXBSIZE),
2094                                             MAXBSIZE, HAT_UNLOAD);
2095                                 }
2096
2097                         } else {
2098                                 if (segmap_kpm)
2099                                         segkpm_mapout_validkpme(GET_KPME(smp));
2100
2101                                 smp->sm_flags &= ~SM_NOTKPM_RELEASED;
2102                                 hat_unload(kas.a_hat, addr, MAXBSIZE,
2103                                     HAT_UNLOAD);
2104                         }
2105                 }
2106                 segmap_smapadd(smp);    /* add to free list */
2107         }
2108
2109         mutex_exit(smtx);
2110
2111         if (is_kpm)
2112                 page_unlock(pp);
2113         /*
2114          * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2115          * are set.
2116          */
2117         if ((flags & ~SM_DONTNEED) != 0) {
2118                 error = VOP_PUTPAGE(vp, offset, MAXBSIZE,
2119                     bflags, CRED(), NULL);
2120         } else {
2121                 error = 0;
2122         }
2123
2124         return (error);
2125 }
2126
2127 /*
2128  * Dump the pages belonging to this segmap segment.
2129  */
2130 static void
2131 segmap_dump(struct seg *seg)
2132 {
2133         struct segmap_data *smd;
2134         struct smap *smp, *smp_end;
2135         page_t *pp;
2136         pfn_t pfn;
2137         u_offset_t off;
2138         caddr_t addr;
2139
2140         smd = (struct segmap_data *)seg->s_data;
2141         addr = seg->s_base;
2142         for (smp = smd->smd_sm, smp_end = smp + smd->smd_npages;
2143             smp < smp_end; smp++) {
2144
2145                 if (smp->sm_refcnt) {
2146                         for (off = 0; off < MAXBSIZE; off += PAGESIZE) {
2147                                 int we_own_it = 0;
2148
2149                                 /*
2150                                  * If pp == NULL, the page either does
2151                                  * not exist or is exclusively locked.
2152                                  * So determine if it exists before
2153                                  * searching for it.
2154                                  */
2155                                 if ((pp = page_lookup_nowait(smp->sm_vp,
2156                                     smp->sm_off + off, SE_SHARED)))
2157                                         we_own_it = 1;
2158                                 else
2159                                         pp = page_exists(smp->sm_vp,
2160                                             smp->sm_off + off);
2161
2162                                 if (pp) {
2163                                         pfn = page_pptonum(pp);
2164                                         dump_addpage(seg->s_as,
2165                                             addr + off, pfn);
2166                                         if (we_own_it)
2167                                                 page_unlock(pp);
2168                                 }
2169                                 dump_timeleft = dump_timeout;
2170                         }
2171                 }
2172                 addr += MAXBSIZE;
2173         }
2174 }
2175
2176 /*ARGSUSED*/
2177 static int
2178 segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
2179     struct page ***ppp, enum lock_type type, enum seg_rw rw)
2180 {
2181         return (ENOTSUP);
2182 }
2183
2184 static int
2185 segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
2186 {
2187         struct segmap_data *smd = (struct segmap_data *)seg->s_data;
2188
2189         memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
2190         memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
2191         return (0);
2192 }
2193
2194 /*ARGSUSED*/
2195 static lgrp_mem_policy_info_t *
2196 segmap_getpolicy(struct seg *seg, caddr_t addr)
2197 {
2198         return (NULL);
2199 }
2200
2201 /*ARGSUSED*/
2202 static int
2203 segmap_capable(struct seg *seg, segcapability_t capability)
2204 {
2205         return (0);
2206 }
2207
2208
2209 #ifdef  SEGKPM_SUPPORT
2210
2211 /*
2212  * segkpm support routines
2213  */
2214
2215 static caddr_t
2216 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2217         struct smap *smp, enum seg_rw rw)
2218 {
2219         caddr_t base;
2220         page_t  *pp;
2221         int     newpage = 0;
2222         struct kpme     *kpme;
2223
2224         ASSERT(smp->sm_refcnt > 0);
2225
2226         if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
2227                 kmutex_t *smtx;
2228
2229                 base = segkpm_create_va(off);
2230
2231                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
2232                     seg, base)) == NULL) {
2233                         panic("segmap_pagecreate_kpm: "
2234                             "page_create failed");
2235                         /*NOTREACHED*/
2236                 }
2237
2238                 newpage = 1;
2239                 page_io_unlock(pp);
2240                 ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
2241
2242                 /*
2243                  * Mark this here until the following segmap_pagecreate
2244                  * or segmap_release.
2245                  */
2246                 smtx = SMAPMTX(smp);
2247                 mutex_enter(smtx);
2248                 smp->sm_flags |= SM_KPM_NEWPAGE;
2249                 mutex_exit(smtx);
2250         }
2251
2252         kpme = GET_KPME(smp);
2253         if (!newpage && kpme->kpe_page == pp)
2254                 base = hat_kpm_page2va(pp, 0);
2255         else
2256                 base = hat_kpm_mapin(pp, kpme);
2257
2258         /*
2259          * FS code may decide not to call segmap_pagecreate and we
2260          * don't invoke segmap_fault via TLB miss, so we have to set
2261          * ref and mod bits in advance.
2262          */
2263         if (rw == S_WRITE) {
2264                 hat_setrefmod(pp);
2265         } else {
2266                 ASSERT(rw == S_READ);
2267                 hat_setref(pp);
2268         }
2269
2270         smd_cpu[CPU->cpu_seqid].scpu.scpu_pagecreate++;
2271
2272         return (base);
2273 }
2274
2275 /*
2276  * Find the smap structure corresponding to the
2277  * KPM addr and return it locked.
2278  */
2279 struct smap *
2280 get_smap_kpm(caddr_t addr, page_t **ppp)
2281 {
2282         struct smap     *smp;
2283         struct vnode    *vp;
2284         u_offset_t      offset;
2285         caddr_t         baseaddr = (caddr_t)((uintptr_t)addr & MAXBMASK);
2286         int             hashid;
2287         kmutex_t        *hashmtx;
2288         page_t          *pp;
2289         union segmap_cpu *scpu;
2290
2291         pp = hat_kpm_vaddr2page(baseaddr);
2292
2293         ASSERT(pp && !PP_ISFREE(pp));
2294         ASSERT(PAGE_LOCKED(pp));
2295         ASSERT(((uintptr_t)pp->p_offset & MAXBOFFSET) == 0);
2296
2297         vp = pp->p_vnode;
2298         offset = pp->p_offset;
2299         ASSERT(vp != NULL);
2300
2301         /*
2302          * Assume the last smap used on this cpu is the one needed.
2303          */
2304         scpu = smd_cpu+CPU->cpu_seqid;
2305         smp = scpu->scpu.scpu_last_smap;
2306         mutex_enter(&smp->sm_mtx);
2307         if (smp->sm_vp == vp && smp->sm_off == offset) {
2308                 ASSERT(smp->sm_refcnt > 0);
2309         } else {
2310                 /*
2311                  * Assumption wrong, find the smap on the hash chain.
2312                  */
2313                 mutex_exit(&smp->sm_mtx);
2314                 SMAP_HASHFUNC(vp, offset, hashid); /* macro assigns hashid */
2315                 hashmtx = SHASHMTX(hashid);
2316
2317                 mutex_enter(hashmtx);
2318                 smp = smd_hash[hashid].sh_hash_list;
2319                 for (; smp != NULL; smp = smp->sm_hash) {
2320                         if (smp->sm_vp == vp && smp->sm_off == offset)
2321                                 break;
2322                 }
2323                 mutex_exit(hashmtx);
2324                 if (smp) {
2325                         mutex_enter(&smp->sm_mtx);
2326                         ASSERT(smp->sm_vp == vp && smp->sm_off == offset);
2327                 }
2328         }
2329
2330         if (ppp)
2331                 *ppp = smp ? pp : NULL;
2332
2333         return (smp);
2334 }
2335
2336 #else   /* SEGKPM_SUPPORT */
2337
2338 /* segkpm stubs */
2339
2340 /*ARGSUSED*/
2341 static caddr_t
2342 segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
2343         struct smap *smp, enum seg_rw rw)
2344 {
2345         return (NULL);
2346 }
2347
2348 /*ARGSUSED*/
2349 struct smap *
2350 get_smap_kpm(caddr_t addr, page_t **ppp)
2351 {
2352         return (NULL);
2353 }
2354
2355 #endif  /* SEGKPM_SUPPORT */