/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */
/*
 * VM - generic vnode mapping segment.
 *
 * The segmap driver is used only by the kernel to get faster (than seg_vn)
 * mappings [lower routine overhead; more persistent cache] to random
 * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/thread.h>
#include <sys/dumphdr.h>
#include <sys/bitmap.h>

#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
/*
 * Private seg op routines.
 */
static void	segmap_free(struct seg *seg);
faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
		size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
		uint_t prot);
static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
		uint_t *protv);
static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
static int	segmap_gettype(struct seg *seg, caddr_t addr);
static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static void	segmap_dump(struct seg *seg);
static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
		struct page ***ppp, enum lock_type type,
		enum seg_rw rw);
static void	segmap_badop(void);
static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
		caddr_t addr);
static int	segmap_capable(struct seg *seg, segcapability_t capability);

static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
			struct smap *, enum seg_rw);
struct smap *get_smap_kpm(caddr_t, page_t **);
#define	SEGMAP_BADOP(t)	(t(*)())segmap_badop

static struct seg_ops segmap_ops = {
	SEGMAP_BADOP(int),	/* dup */
	SEGMAP_BADOP(int),	/* unmap */
	segmap_free,
	segmap_fault,
	segmap_faulta,
	SEGMAP_BADOP(int),	/* setprot */
	segmap_checkprot,
	segmap_kluster,
	SEGMAP_BADOP(size_t),	/* swapout */
	SEGMAP_BADOP(int),	/* sync */
	SEGMAP_BADOP(size_t),	/* incore */
	SEGMAP_BADOP(int),	/* lockop */
	segmap_getprot,
	segmap_getoffset,
	segmap_gettype,
	segmap_getvp,
	SEGMAP_BADOP(int),	/* advise */
	segmap_dump,
	segmap_pagelock,	/* pagelock */
	SEGMAP_BADOP(int),	/* setpgsz */
	segmap_getmemid,	/* getmemid */
	segmap_getpolicy,	/* getpolicy */
	segmap_capable,		/* capable */
};
/*
 * Private segmap routines.
 */
static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
		size_t len, enum seg_rw rw, struct smap *smp);
static void	segmap_smapadd(struct smap *smp);
static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
		u_offset_t off, int hashid);
static void	segmap_hashout(struct smap *smp);
/*
 * Statistics for segmap operations.
 *
 * No explicit locking to protect these stats.
 */
struct segmapcnt segmapcnt = {
	{ "fault",		KSTAT_DATA_ULONG },
	{ "faulta",		KSTAT_DATA_ULONG },
	{ "getmap",		KSTAT_DATA_ULONG },
	{ "get_use",		KSTAT_DATA_ULONG },
	{ "get_reclaim",	KSTAT_DATA_ULONG },
	{ "get_reuse",		KSTAT_DATA_ULONG },
	{ "get_unused",		KSTAT_DATA_ULONG },
	{ "get_nofree",		KSTAT_DATA_ULONG },
	{ "rel_async",		KSTAT_DATA_ULONG },
	{ "rel_write",		KSTAT_DATA_ULONG },
	{ "rel_free",		KSTAT_DATA_ULONG },
	{ "rel_abort",		KSTAT_DATA_ULONG },
	{ "rel_dontneed",	KSTAT_DATA_ULONG },
	{ "release",		KSTAT_DATA_ULONG },
	{ "pagecreate",		KSTAT_DATA_ULONG },
	{ "free_notfree",	KSTAT_DATA_ULONG },
	{ "free_dirty",		KSTAT_DATA_ULONG },
	{ "free",		KSTAT_DATA_ULONG },
	{ "stolen",		KSTAT_DATA_ULONG },
	{ "get_nomtx",		KSTAT_DATA_ULONG }
};

kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
/*
 * Return number of map pages in segment.
 */
#define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)

/*
 * Translate addr into smap number within segment.
 */
#define	MAP_PAGE(seg, addr)	(((addr) - (seg)->s_base) >> MAXBSHIFT)

/*
 * Translate addr in seg into struct smap pointer.
 */
#define	GET_SMAP(seg, addr)	\
	&(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])

/*
 * Bit in map (16 bit bitmap).
 */
#define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
static int smd_colormsk = 0;
static int smd_ncolor = 0;
static int smd_nfree = 0;
static int smd_freemsk = 0;

static int *colors_used;

static struct smap *smd_smap;
static struct smaphash *smd_hash;
#ifdef SEGMAP_HASHSTATS
static unsigned int *smd_hash_len;
#endif
static struct smfree *smd_free;
static ulong_t smd_hashmsk = 0;
#define	SEGMAP_MAXCOLOR		2
#define	SEGMAP_CACHE_PAD	64

union segmap_cpu {
	struct {
		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
		struct smap	*scpu_last_smap;
		ulong_t		scpu_getmap;
		ulong_t		scpu_release;
		ulong_t		scpu_get_reclaim;
		ulong_t		scpu_fault;
		ulong_t		scpu_pagecreate;
		ulong_t		scpu_get_reuse;
	} scpu;
	char	scpu_pad[SEGMAP_CACHE_PAD];
};
static union segmap_cpu *smd_cpu;
/*
 * There are three locks in seg_map:
 *	- per freelist mutexes
 *	- per hashchain mutexes
 *	- per smap mutexes
 *
 * The lock ordering is to get the smap mutex to lock down the slot
 * first then the hash lock (for hash in/out (vp, off) list) or the
 * freelist lock to put the slot back on the free list.
 *
 * The hash search is done by holding only the hashchain lock; when a wanted
 * slot is found, we drop the hashchain lock and then lock the slot, so there
 * is no overlapping of hashchain and smap locks.  After the slot is
 * locked, we verify again that the slot is still what we are looking for.
 *
 * Allocation of a free slot is done by holding the freelist lock,
 * then locking the smap slot at the head of the freelist.  This is
 * in reversed lock order, so mutex_tryenter() is used.
 *
 * The smap lock protects all fields in the smap structure except for
 * the link fields for hash/free lists, which are protected by the
 * hashchain and freelist locks.
 */
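/*
 * For illustration only - a rough sketch of the hash-search protocol
 * described above, as it is used later in segmap_getmapflt(); this is
 * not an additional helper in this file:
 *
 *	SMAP_HASHFUNC(vp, off, hashid);
 *	mutex_enter(SHASHMTX(hashid));		(hashchain lock only)
 *	for (smp = smd_hash[hashid].sh_hash_list;
 *	    smp != NULL; smp = smp->sm_hash)
 *		if (smp->sm_vp == vp && smp->sm_off == off)
 *			break;
 *	mutex_exit(SHASHMTX(hashid));		(dropped before the smap lock)
 *	if (smp != NULL) {
 *		mutex_enter(SMAPMTX(smp));	(now lock the slot)
 *		if (smp->sm_vp != vp || smp->sm_off != off)
 *			... slot was reused in the meantime, retry the search ...
 *	}
 */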
#define	SHASHMTX(hashid)	(&smd_hash[hashid].sh_mtx)

#define	SMP2SMF(smp)		(&smd_free[(smp - smd_smap) & smd_freemsk])
#define	SMP2SMF_NDX(smp)	(ushort_t)((smp - smd_smap) & smd_freemsk)

#define	SMAPMTX(smp)		(&smp->sm_mtx)

#define	SMAP_HASHFUNC(vp, off, hashid) \
	{ \
	hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
	    ((off) >> MAXBSHIFT)) & smd_hashmsk); \
	}
/*
 * The most frequently updated kstat counters are kept in the
 * per cpu array to avoid hot cache blocks.  The update function
 * sums the cpu local counters to update the global counters.
 */
static int
segmap_kstat_update(kstat_t *ksp, int rw)
{
	int i;
	ulong_t getmap, release, get_reclaim;
	ulong_t fault, pagecreate, get_reuse;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	getmap = release = get_reclaim = (ulong_t)0;
	fault = pagecreate = get_reuse = (ulong_t)0;
	for (i = 0; i < max_ncpus; i++) {
		getmap += smd_cpu[i].scpu.scpu_getmap;
		release += smd_cpu[i].scpu.scpu_release;
		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
		fault += smd_cpu[i].scpu.scpu_fault;
		pagecreate += smd_cpu[i].scpu.scpu_pagecreate;
		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
	}
	segmapcnt.smp_getmap.value.ul = getmap;
	segmapcnt.smp_release.value.ul = release;
	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
	segmapcnt.smp_fault.value.ul = fault;
	segmapcnt.smp_pagecreate.value.ul = pagecreate;
	segmapcnt.smp_get_reuse.value.ul = get_reuse;
	return (0);
}
int
segmap_create(struct seg *seg, void *argsp)
{
	struct segmap_data *smd;
	struct smap *smp;
	struct smfree *sm;
	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
	struct smaphash *shashp;
	union segmap_cpu *scpu;
	long i, npages;
	size_t hashsz;
	uint_t nfreelist;
	extern void prefetch_smap_w(void *);
	extern int max_ncpus;

	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));

	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
		panic("segkmap not MAXBSIZE aligned");
		/*NOTREACHED*/
	}

	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);

	seg->s_data = (void *)smd;
	seg->s_ops = &segmap_ops;
	smd->smd_prot = a->prot;

	/*
	 * Scale the number of smap freelists to be
	 * proportional to max_ncpus * number of virtual colors.
	 * The caller can over-ride this scaling by providing
	 * a non-zero a->nfreelist argument.
	 */
	nfreelist = a->nfreelist;
	if (nfreelist == 0)
		nfreelist = max_ncpus;
	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
		    "%d, using %d", nfreelist, max_ncpus);
		nfreelist = max_ncpus;
	}
	if (!ISP2(nfreelist)) {
		/* round up nfreelist to the next power of two. */
		nfreelist = 1 << (highbit(nfreelist));
	}

	/*
	 * Get the number of virtual colors - must be a power of 2.
	 */
	smd_ncolor = a->shmsize >> MAXBSHIFT;
	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
	smd_colormsk = smd_ncolor - 1;
	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
	smd_freemsk = smd_nfree - 1;

	/*
	 * Allocate and initialize the freelist headers.
	 * Note that sm_freeq[1] starts out as the release queue. This
	 * is known when the smap structures are initialized below.
	 */
	smd_free = smd->smd_free =
	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
	for (i = 0; i < smd_nfree; i++) {
		sm = &smd->smd_free[i];
		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
		sm->sm_allocq = &sm->sm_freeq[0];
		sm->sm_releq = &sm->sm_freeq[1];
	}

	/*
	 * Allocate and initialize the smap hash chain headers.
	 * Compute hash size rounding down to the next power of two.
	 */
	npages = MAP_PAGES(seg);
	smd->smd_npages = npages;
	hashsz = npages / SMAP_HASHAVELEN;
	hashsz = 1 << (highbit(hashsz)-1);
	smd_hashmsk = hashsz - 1;
	smd_hash = smd->smd_hash =
	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
#ifdef SEGMAP_HASHSTATS
	smd_hash_len =
	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
#endif
	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
		shashp->sh_hash_list = NULL;
		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
	}

	/*
	 * Allocate and initialize the smap structures.
	 * Link all slots onto the appropriate freelist.
	 * The smap array is large enough to affect boot time
	 * on large systems, so use memory prefetching and only
	 * go through the array 1 time. Inline a optimized version
	 * of segmap_smapadd to add structures to freelists with
	 * knowledge that no locks are needed here.
	 */
	smd_smap = smd->smd_sm =
	    kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);

	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
	    smp >= smd->smd_sm; smp--) {
		struct smap *smpfreelist;
		struct sm_freeq *releq;

		prefetch_smap_w((char *)smp);

		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
		smp->sm_free_ndx = SMP2SMF_NDX(smp);

		sm = SMP2SMF(smp);
		releq = sm->sm_releq;

		smpfreelist = releq->smq_free;
		if (smpfreelist == 0) {
			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
		} else {
			smp->sm_next = smpfreelist;
			smp->sm_prev = smpfreelist->sm_prev;
			smpfreelist->sm_prev = smp;
			smp->sm_prev->sm_next = smp;
			releq->smq_free = smp->sm_next;
		}

		/*
		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
		 */
		smp->sm_flags = 0;

#ifdef SEGKPM_SUPPORT
		/*
		 * Due to the fragile prefetch loop no
		 * separate function is used here.
		 */
		smp->sm_kpme_next = NULL;
		smp->sm_kpme_prev = NULL;
		smp->sm_kpme_page = NULL;
#endif
	}

	/*
	 * Allocate the per color indices that distribute allocation
	 * requests over the free lists. Each cpu will have a private
	 * rotor index to spread the allocations even across the available
	 * smap freelists. Init the scpu_last_smap field to the first
	 * smap element so there is no need to check for NULL.
	 */
	smd_cpu =
	    kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
		int j;

		for (j = 0; j < smd_ncolor; j++)
			scpu->scpu.scpu_free_ndx[j] = j;
		scpu->scpu.scpu_last_smap = smd_smap;
	}

	/*
	 * Keep track of which colors are used more often.
	 */
	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);

	return (0);
}

static void
segmap_free(struct seg *seg)
{
	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
}
/*
 * Do a F_SOFTUNLOCK call over the range requested.
 * The range must have already been F_SOFTLOCK'ed.
 */
504 ASSERT(smp
->sm_refcnt
> 0);
510 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
513 * We're called only from segmap_fault and this was a
514 * NOP in case of a kpm based smap, so dangerous things
515 * must have happened in the meantime. Pages are prefaulted
516 * and locked in segmap_getmapflt and they will not be
517 * unlocked until segmap_release.
519 panic("segmap_unlock: called with kpm addr %p", (void *)addr
);
524 off
= smp
->sm_off
+ (u_offset_t
)((uintptr_t)addr
& MAXBOFFSET
);
526 hat_unlock(hat
, addr
, P2ROUNDUP(len
, PAGESIZE
));
527 for (adr
= addr
; adr
< addr
+ len
; adr
+= PAGESIZE
, off
+= PAGESIZE
) {
531 * Use page_find() instead of page_lookup() to
532 * find the page since we know that it has
535 pp
= page_find(vp
, off
);
537 panic("segmap_unlock: page not found");
543 } else if (rw
!= S_OTHER
) {
544 TRACE_3(TR_FAC_VM
, TR_SEGMAP_FAULT
,
545 "segmap_fault:pp %p vp %p offset %llx", pp
, vp
, off
);
550 * Clear bitmap, if the bit corresponding to "off" is set,
551 * since the page and translation are being unlocked.
553 bitmask
= SMAP_BIT_MASK((off
- smp
->sm_off
) >> PAGESHIFT
);
556 * Large Files: Following assertion is to verify
557 * the correctness of the cast to (int) above.
559 ASSERT((u_offset_t
)(off
- smp
->sm_off
) <= INT_MAX
);
562 if (smp
->sm_bitmap
& bitmask
) {
563 smp
->sm_bitmap
&= ~bitmask
;
571 #define MAXPPB (MAXBSIZE/4096) /* assumes minimum page size of 4k */
/*
 * This routine is called via a machine specific fault handling
 * routine.  It is also called by software routines wishing to
 * lock or unlock a range of addresses.
 *
 * Note that this routine expects a page-aligned "addr".
 */
586 enum fault_type type
,
589 struct segmap_data
*smd
= (struct segmap_data
*)seg
->s_data
;
594 page_t
*pl
[MAXPPB
+ 1];
602 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
607 * Pages are successfully prefaulted and locked in
608 * segmap_getmapflt and can't be unlocked until
609 * segmap_release. No hat mappings have to be locked
610 * and they also can't be unlocked as long as the
611 * caller owns an active kpm addr.
614 if (type
!= F_SOFTUNLOCK
)
618 if ((smp
= get_smap_kpm(addr
, NULL
)) == NULL
) {
619 panic("segmap_fault: smap not found "
620 "for addr %p", (void *)addr
);
626 newpage
= smp
->sm_flags
& SM_KPM_NEWPAGE
;
628 cmn_err(CE_WARN
, "segmap_fault: newpage? smp %p",
632 if (type
!= F_SOFTUNLOCK
) {
639 sm_off
= smp
->sm_off
;
642 return (FC_MAKE_ERR(EIO
));
644 ASSERT(smp
->sm_refcnt
> 0);
646 addroff
= (u_offset_t
)((uintptr_t)addr
& MAXBOFFSET
);
647 if (addroff
+ len
> MAXBSIZE
)
648 panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
649 (void *)(addr
+ len
));
651 off
= sm_off
+ addroff
;
653 pp
= page_find(vp
, off
);
656 panic("segmap_fault: softunlock page not found");
659 * Set ref bit also here in case of S_OTHER to avoid the
660 * overhead of supporting other cases than F_SOFTUNLOCK
661 * with segkpm. We can do this because the underlying
662 * pages are locked anyway.
667 TRACE_3(TR_FAC_VM
, TR_SEGMAP_FAULT
,
668 "segmap_fault:pp %p vp %p offset %llx",
676 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_fault
++;
677 smp
= GET_SMAP(seg
, addr
);
679 sm_off
= smp
->sm_off
;
682 return (FC_MAKE_ERR(EIO
));
684 ASSERT(smp
->sm_refcnt
> 0);
686 addroff
= (u_offset_t
)((uintptr_t)addr
& MAXBOFFSET
);
687 if (addroff
+ len
> MAXBSIZE
) {
688 panic("segmap_fault: endaddr %p "
689 "exceeds MAXBSIZE chunk", (void *)(addr
+ len
));
692 off
= sm_off
+ addroff
;
695 * First handle the easy stuff
697 if (type
== F_SOFTUNLOCK
) {
698 segmap_unlock(hat
, seg
, addr
, len
, rw
, smp
);
702 TRACE_3(TR_FAC_VM
, TR_SEGMAP_GETPAGE
,
703 "segmap_getpage:seg %p addr %p vp %p", seg
, addr
, vp
);
704 err
= VOP_GETPAGE(vp
, (offset_t
)off
, len
, &prot
, pl
, MAXBSIZE
,
705 seg
, addr
, rw
, CRED(), NULL
);
708 return (FC_MAKE_ERR(err
));
710 prot
&= smd
->smd_prot
;
713 * Handle all pages returned in the pl[] array.
714 * This loop is coded on the assumption that if
715 * there was no error from the VOP_GETPAGE routine,
716 * that the page list returned will contain all the
717 * needed pages for the vp from [off..off + len].
720 while ((pp
= *ppp
++) != NULL
) {
722 ASSERT(pp
->p_vnode
== vp
);
726 * Verify that the pages returned are within the range
727 * of this segmap region. Note that it is theoretically
728 * possible for pages outside this range to be returned,
729 * but it is not very likely. If we cannot use the
730 * page here, just release it and go on to the next one.
732 if (pp
->p_offset
< sm_off
||
733 pp
->p_offset
>= sm_off
+ MAXBSIZE
) {
734 (void) page_release(pp
, 1);
738 ASSERT(hat
== kas
.a_hat
);
740 adr
= addr
+ (poff
- off
);
741 if (adr
>= addr
&& adr
< addr
+ len
) {
743 TRACE_3(TR_FAC_VM
, TR_SEGMAP_FAULT
,
744 "segmap_fault:pp %p vp %p offset %llx",
746 if (type
== F_SOFTLOCK
)
747 hat_flag
= HAT_LOAD_LOCK
;
751 * Deal with VMODSORT pages here. If we know this is a write
752 * do the setmod now and allow write protection.
753 * As long as it's modified or not S_OTHER, remove write
754 * protection. With S_OTHER it's up to the FS to deal with this.
756 if (IS_VMODSORT(vp
)) {
759 else if (rw
!= S_OTHER
&& !hat_ismod(pp
))
763 hat_memload(hat
, adr
, pp
, prot
, hat_flag
);
764 if (hat_flag
!= HAT_LOAD_LOCK
)
/*
 * This routine is used to start I/O on pages asynchronously.
 */
static faultcode_t
segmap_faulta(struct seg *seg, caddr_t addr)
{
781 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
786 * Pages are successfully prefaulted and locked in
787 * segmap_getmapflt and can't be unlocked until
788 * segmap_release. No hat mappings have to be locked
789 * and they also can't be unlocked as long as the
790 * caller owns an active kpm addr.
793 if ((smp
= get_smap_kpm(addr
, NULL
)) == NULL
) {
794 panic("segmap_faulta: smap not found "
795 "for addr %p", (void *)addr
);
800 newpage
= smp
->sm_flags
& SM_KPM_NEWPAGE
;
803 cmn_err(CE_WARN
, "segmap_faulta: newpage? smp %p",
809 segmapcnt
.smp_faulta
.value
.ul
++;
810 smp
= GET_SMAP(seg
, addr
);
812 ASSERT(smp
->sm_refcnt
> 0);
818 cmn_err(CE_WARN
, "segmap_faulta - no vp");
819 return (FC_MAKE_ERR(EIO
));
822 TRACE_3(TR_FAC_VM
, TR_SEGMAP_GETPAGE
,
823 "segmap_getpage:seg %p addr %p vp %p", seg
, addr
, vp
);
825 err
= VOP_GETPAGE(vp
, (offset_t
)(off
+ ((offset_t
)((uintptr_t)addr
826 & MAXBOFFSET
))), PAGESIZE
, (uint_t
*)NULL
, (page_t
**)NULL
, 0,
827 seg
, addr
, S_READ
, CRED(), NULL
);
830 return (FC_MAKE_ERR(err
));
static int
segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;

	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));

	/*
	 * Need not acquire the segment lock since
	 * "smd_prot" is a read-only field.
	 */
	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
}
850 segmap_getprot(struct seg
*seg
, caddr_t addr
, size_t len
, uint_t
*protv
)
852 struct segmap_data
*smd
= (struct segmap_data
*)seg
->s_data
;
853 size_t pgno
= seg_page(seg
, addr
+ len
) - seg_page(seg
, addr
) + 1;
855 ASSERT(seg
->s_as
&& AS_LOCK_HELD(seg
->s_as
, &seg
->s_as
->a_lock
));
859 protv
[--pgno
] = smd
->smd_prot
;
static u_offset_t
segmap_getoffset(struct seg *seg, caddr_t addr)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;

	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
}
877 segmap_gettype(struct seg
*seg
, caddr_t addr
)
879 ASSERT(seg
->s_as
&& RW_READ_HELD(&seg
->s_as
->a_lock
));
static int
segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;

	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));

	/* XXX - This doesn't make any sense */
	*vpp = smd->smd_sm->sm_vp;
	return (0);
}
/*
 * Check to see if it makes sense to do kluster/read ahead to
 * addr + delta relative to the mapping at addr.  We assume here
 * that delta is a signed PAGESIZE'd multiple (which can be negative).
 *
 * For segmap we always "approve" of this action from our standpoint.
 */
static int
segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
	return (0);
}

static void
segmap_badop(void)
{
	panic("segmap_badop");
}

/*
 * Special private segmap operations
 */

/*
 * Add smap to the appropriate free list.
 */
926 segmap_smapadd(struct smap
*smp
)
929 struct smap
*smpfreelist
;
930 struct sm_freeq
*releq
;
932 ASSERT(MUTEX_HELD(SMAPMTX(smp
)));
934 if (smp
->sm_refcnt
!= 0) {
935 panic("segmap_smapadd");
939 sm
= &smd_free
[smp
->sm_free_ndx
];
941 * Add to the tail of the release queue
942 * Note that sm_releq and sm_allocq could toggle
943 * before we get the lock. This does not affect
944 * correctness as the 2 queues are only maintained
945 * to reduce lock pressure.
947 releq
= sm
->sm_releq
;
948 if (releq
== &sm
->sm_freeq
[0])
949 smp
->sm_flags
|= SM_QNDX_ZERO
;
951 smp
->sm_flags
&= ~SM_QNDX_ZERO
;
952 mutex_enter(&releq
->smq_mtx
);
953 smpfreelist
= releq
->smq_free
;
954 if (smpfreelist
== 0) {
957 releq
->smq_free
= smp
->sm_next
= smp
->sm_prev
= smp
;
959 * Both queue mutexes held to set sm_want;
960 * snapshot the value before dropping releq mutex.
961 * If sm_want appears after the releq mutex is dropped,
962 * then the smap just freed is already gone.
965 mutex_exit(&releq
->smq_mtx
);
967 * See if there was a waiter before dropping the releq mutex
968 * then recheck after obtaining sm_freeq[0] mutex as
969 * the another thread may have already signaled.
972 mutex_enter(&sm
->sm_freeq
[0].smq_mtx
);
974 cv_signal(&sm
->sm_free_cv
);
975 mutex_exit(&sm
->sm_freeq
[0].smq_mtx
);
978 smp
->sm_next
= smpfreelist
;
979 smp
->sm_prev
= smpfreelist
->sm_prev
;
980 smpfreelist
->sm_prev
= smp
;
981 smp
->sm_prev
->sm_next
= smp
;
982 mutex_exit(&releq
->smq_mtx
);
988 segmap_hashin(struct smap
*smp
, struct vnode
*vp
, u_offset_t off
, int hashid
)
994 ASSERT(MUTEX_HELD(SMAPMTX(smp
)));
995 ASSERT(smp
->sm_vp
== NULL
);
996 ASSERT(smp
->sm_hash
== NULL
);
997 ASSERT(smp
->sm_prev
== NULL
);
998 ASSERT(smp
->sm_next
== NULL
);
999 ASSERT(hashid
>= 0 && hashid
<= smd_hashmsk
);
1001 hmtx
= SHASHMTX(hashid
);
1005 * First we need to verify that no one has created a smp
1006 * with (vp,off) as its tag before we us.
1008 for (tmp
= smd_hash
[hashid
].sh_hash_list
;
1009 tmp
!= NULL
; tmp
= tmp
->sm_hash
)
1010 if (tmp
->sm_vp
== vp
&& tmp
->sm_off
== off
)
	/*
	 * No one created one yet.
	 *
	 * Funniness here - we don't increment the ref count on the
	 * vnode * even though we have another pointer to it here.
	 * The reason for this is that we don't want the fact that
	 * a seg_map entry somewhere refers to a vnode to prevent the
	 * vnode * itself from going away.  This is because this
	 * reference to the vnode is a "soft one".  In the case where
	 * a mapping is being used by a rdwr [or directory routine?]
	 * there already has to be a non-zero ref count on the vnode.
	 * In the case where the vp has been freed and the smap
	 * structure is on the free list, there are no pages in memory
	 * that can refer to the vnode.  Thus even if we reuse the same
	 * vnode/smap structure for a vnode which has the same
	 * address but represents a different object, we are ok.
	 */
1034 hpp
= &smd_hash
[hashid
].sh_hash_list
;
1035 smp
->sm_hash
= *hpp
;
1037 #ifdef SEGMAP_HASHSTATS
1038 smd_hash_len
[hashid
]++;
1047 segmap_hashout(struct smap
*smp
)
1049 struct smap
**hpp
, *hp
;
1055 ASSERT(MUTEX_HELD(SMAPMTX(smp
)));
1060 SMAP_HASHFUNC(vp
, off
, hashid
); /* macro assigns hashid */
1061 mtx
= SHASHMTX(hashid
);
1064 hpp
= &smd_hash
[hashid
].sh_hash_list
;
1068 panic("segmap_hashout");
1076 *hpp
= smp
->sm_hash
;
1077 smp
->sm_hash
= NULL
;
1078 #ifdef SEGMAP_HASHSTATS
1079 smd_hash_len
[hashid
]--;
1084 smp
->sm_off
= (u_offset_t
)0;
1089 * Attempt to free unmodified, unmapped, and non locked segmap
1093 segmap_pagefree(struct vnode
*vp
, u_offset_t off
)
1098 for (pgoff
= off
; pgoff
< off
+ MAXBSIZE
; pgoff
+= PAGESIZE
) {
1100 if ((pp
= page_lookup_nowait(vp
, pgoff
, SE_EXCL
)) == NULL
)
1103 switch (page_release(pp
, 1)) {
1105 segmapcnt
.smp_free_notfree
.value
.ul
++;
1108 segmapcnt
.smp_free_dirty
.value
.ul
++;
1111 segmapcnt
.smp_free
.value
.ul
++;
1118 * Locks held on entry: smap lock
1119 * Locks held on exit : smap lock.
1123 grab_smp(struct smap
*smp
, page_t
*pp
)
1125 ASSERT(MUTEX_HELD(SMAPMTX(smp
)));
1126 ASSERT(smp
->sm_refcnt
== 0);
1128 if (smp
->sm_vp
!= (struct vnode
*)NULL
) {
1129 struct vnode
*vp
= smp
->sm_vp
;
1130 u_offset_t off
= smp
->sm_off
;
1132 * Destroy old vnode association and
1133 * unload any hardware translations to
1136 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_get_reuse
++;
1137 segmap_hashout(smp
);
1140 * This node is off freelist and hashlist,
1141 * so there is no reason to drop/reacquire sm_mtx
1142 * across calls to hat_unload.
1146 int hat_unload_needed
= 0;
1149 * unload kpm mapping
1152 vaddr
= hat_kpm_page2va(pp
, 1);
1153 hat_kpm_mapout(pp
, GET_KPME(smp
), vaddr
);
1158 * Check if we have (also) the rare case of a
1161 if (smp
->sm_flags
& SM_NOTKPM_RELEASED
) {
1162 hat_unload_needed
= 1;
1163 smp
->sm_flags
&= ~SM_NOTKPM_RELEASED
;
1166 if (hat_unload_needed
) {
1167 hat_unload(kas
.a_hat
, segkmap
->s_base
+
1168 ((smp
- smd_smap
) * MAXBSIZE
),
1169 MAXBSIZE
, HAT_UNLOAD
);
1173 ASSERT(smp
->sm_flags
& SM_NOTKPM_RELEASED
);
1174 smp
->sm_flags
&= ~SM_NOTKPM_RELEASED
;
1175 hat_unload(kas
.a_hat
, segkmap
->s_base
+
1176 ((smp
- smd_smap
) * MAXBSIZE
),
1177 MAXBSIZE
, HAT_UNLOAD
);
1179 segmap_pagefree(vp
, off
);
1183 static struct smap
*
1184 get_free_smp(int free_ndx
)
1188 struct smap
*smp
, *first
;
1189 struct sm_freeq
*allocq
, *releq
;
1192 int end_ndx
, page_locked
= 0;
1195 sm
= &smd_free
[free_ndx
];
1198 allocq
= sm
->sm_allocq
;
1199 mutex_enter(&allocq
->smq_mtx
);
1201 if ((smp
= allocq
->smq_free
) == NULL
) {
1205 * The alloc list is empty or this queue is being skipped;
1206 * first see if the allocq toggled.
1208 if (sm
->sm_allocq
!= allocq
) {
1210 mutex_exit(&allocq
->smq_mtx
);
1213 releq
= sm
->sm_releq
;
1214 if (!mutex_tryenter(&releq
->smq_mtx
)) {
1215 /* cannot get releq; a free smp may be there now */
1216 mutex_exit(&allocq
->smq_mtx
);
1219 * This loop could spin forever if this thread has
1220 * higher priority than the thread that is holding
1221 * releq->smq_mtx. In order to force the other thread
1222 * to run, we'll lock/unlock the mutex which is safe
1223 * since we just unlocked the allocq mutex.
1225 mutex_enter(&releq
->smq_mtx
);
1226 mutex_exit(&releq
->smq_mtx
);
1229 if (releq
->smq_free
== NULL
) {
1231 * This freelist is empty.
1232 * This should not happen unless clients
1233 * are failing to release the segmap
1234 * window after accessing the data.
1235 * Before resorting to sleeping, try
1236 * the next list of the same color.
1238 free_ndx
= (free_ndx
+ smd_ncolor
) & smd_freemsk
;
1239 if (free_ndx
!= end_ndx
) {
1240 mutex_exit(&releq
->smq_mtx
);
1241 mutex_exit(&allocq
->smq_mtx
);
1242 sm
= &smd_free
[free_ndx
];
1246 * Tried all freelists of the same color once,
1247 * wait on this list and hope something gets freed.
1249 segmapcnt
.smp_get_nofree
.value
.ul
++;
1251 mutex_exit(&sm
->sm_freeq
[1].smq_mtx
);
1252 cv_wait(&sm
->sm_free_cv
,
1253 &sm
->sm_freeq
[0].smq_mtx
);
1255 mutex_exit(&sm
->sm_freeq
[0].smq_mtx
);
1256 sm
= &smd_free
[free_ndx
];
1260 * Something on the rele queue; flip the alloc
1261 * and rele queues and retry.
1263 sm
->sm_allocq
= releq
;
1264 sm
->sm_releq
= allocq
;
1265 mutex_exit(&allocq
->smq_mtx
);
1266 mutex_exit(&releq
->smq_mtx
);
1275 * Fastpath the case we get the smap mutex
1280 smtx
= SMAPMTX(smp
);
1281 if (!mutex_tryenter(smtx
)) {
1283 * Another thread is trying to reclaim this slot.
1284 * Skip to the next queue or smap.
1286 if ((smp
= smp
->sm_next
) == first
) {
1293 * if kpme exists, get shared lock on the page
1295 if (segmap_kpm
&& smp
->sm_vp
!= NULL
) {
1297 kpme
= GET_KPME(smp
);
1298 pp
= kpme
->kpe_page
;
1301 if (!page_trylock(pp
, SE_SHARED
)) {
1314 if (kpme
->kpe_page
== NULL
) {
1323 * At this point, we've selected smp. Remove smp
1324 * from its freelist. If smp is the first one in
1325 * the freelist, update the head of the freelist.
1328 ASSERT(first
== allocq
->smq_free
);
1329 allocq
->smq_free
= smp
->sm_next
;
1333 * if the head of the freelist still points to smp,
1334 * then there are no more free smaps in that list.
1336 if (allocq
->smq_free
== smp
)
1340 allocq
->smq_free
= NULL
;
1342 smp
->sm_prev
->sm_next
= smp
->sm_next
;
1343 smp
->sm_next
->sm_prev
= smp
->sm_prev
;
1345 mutex_exit(&allocq
->smq_mtx
);
1346 smp
->sm_prev
= smp
->sm_next
= NULL
;
1349 * if pp != NULL, pp must have been locked;
1350 * grab_smp() unlocks pp.
1352 ASSERT((pp
== NULL
) || PAGE_LOCKED(pp
));
1354 /* return smp locked. */
1355 ASSERT(SMAPMTX(smp
) == smtx
);
1356 ASSERT(MUTEX_HELD(smtx
));
/*
 * Special public segmap operations
 */

/*
 * Create pages (without using VOP_GETPAGE) and load up translations to them.
 * If softlock is TRUE, then set things up so that it looks like a call
 * to segmap_fault with F_SOFTLOCK.
 *
 * Returns 1 if a page is created by calling page_create_va(), or 0 otherwise.
 *
 * All fields in the generic segment (struct seg) are considered to be
 * read-only for "segmap" even though the kernel address space (kas) may
 * not be locked, hence no lock is needed to access them.
 */
1378 segmap_pagecreate(struct seg
*seg
, caddr_t addr
, size_t len
, int softlock
)
1380 struct segmap_data
*smd
= (struct segmap_data
*)seg
->s_data
;
1391 ASSERT(seg
->s_as
== &kas
);
1393 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
1395 * Pages are successfully prefaulted and locked in
1396 * segmap_getmapflt and can't be unlocked until
1397 * segmap_release. The SM_KPM_NEWPAGE flag is set
1398 * in segmap_pagecreate_kpm when new pages are created.
1399 * and it is returned as "newpage" indication here.
1401 if ((smp
= get_smap_kpm(addr
, NULL
)) == NULL
) {
1402 panic("segmap_pagecreate: smap not found "
1403 "for addr %p", (void *)addr
);
1407 smtx
= SMAPMTX(smp
);
1408 newpage
= smp
->sm_flags
& SM_KPM_NEWPAGE
;
1409 smp
->sm_flags
&= ~SM_KPM_NEWPAGE
;
1415 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_pagecreate
++;
1418 addr
= (caddr_t
)((uintptr_t)addr
& (uintptr_t)PAGEMASK
);
1420 smp
= GET_SMAP(seg
, addr
);
1423 * We don't grab smp mutex here since we assume the smp
1424 * has a refcnt set already which prevents the slot from
1427 ASSERT(smp
->sm_refcnt
> 0);
1430 off
= smp
->sm_off
+ ((u_offset_t
)((uintptr_t)addr
& MAXBOFFSET
));
1431 prot
= smd
->smd_prot
;
1433 for (; addr
< eaddr
; addr
+= PAGESIZE
, off
+= PAGESIZE
) {
1434 hat_flag
= HAT_LOAD
;
1435 pp
= page_lookup(vp
, off
, SE_SHARED
);
1439 if ((pp
= page_create_va(vp
, off
,
1440 PAGESIZE
, PG_WAIT
, seg
, addr
)) == NULL
) {
1441 panic("segmap_pagecreate: page_create failed");
1448 * Since pages created here do not contain valid
1449 * data until the caller writes into them, the
1450 * "exclusive" lock will not be dropped to prevent
1451 * other users from accessing the page. We also
1452 * have to lock the translation to prevent a fault
1453 * from occurring when the virtual address mapped by
1454 * this page is written into. This is necessary to
1455 * avoid a deadlock since we haven't dropped the
1458 bitindex
= (ushort_t
)((off
- smp
->sm_off
) >> PAGESHIFT
);
1461 * Large Files: The following assertion is to
1462 * verify the cast above.
1464 ASSERT((u_offset_t
)(off
- smp
->sm_off
) <= INT_MAX
);
1465 smtx
= SMAPMTX(smp
);
1467 smp
->sm_bitmap
|= SMAP_BIT_MASK(bitindex
);
1470 hat_flag
= HAT_LOAD_LOCK
;
1471 } else if (softlock
) {
1472 hat_flag
= HAT_LOAD_LOCK
;
1475 if (IS_VMODSORT(pp
->p_vnode
) && (prot
& PROT_WRITE
))
1478 hat_memload(kas
.a_hat
, addr
, pp
, prot
, hat_flag
);
1480 if (hat_flag
!= HAT_LOAD_LOCK
)
1483 TRACE_5(TR_FAC_VM
, TR_SEGMAP_PAGECREATE
,
1484 "segmap_pagecreate:seg %p addr %p pp %p vp %p offset %llx",
1485 seg
, addr
, pp
, vp
, off
);
1492 segmap_pageunlock(struct seg
*seg
, caddr_t addr
, size_t len
, enum seg_rw rw
)
1502 ASSERT(seg
->s_as
== &kas
);
1505 addr
= (caddr_t
)((uintptr_t)addr
& (uintptr_t)PAGEMASK
);
1507 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
1509 * Pages are successfully prefaulted and locked in
1510 * segmap_getmapflt and can't be unlocked until
1511 * segmap_release, so no pages or hat mappings have
1512 * to be unlocked at this point.
1515 if ((smp
= get_smap_kpm(addr
, NULL
)) == NULL
) {
1516 panic("segmap_pageunlock: smap not found "
1517 "for addr %p", (void *)addr
);
1521 ASSERT(smp
->sm_refcnt
> 0);
1522 mutex_exit(SMAPMTX(smp
));
1527 smp
= GET_SMAP(seg
, addr
);
1528 smtx
= SMAPMTX(smp
);
1530 ASSERT(smp
->sm_refcnt
> 0);
1533 off
= smp
->sm_off
+ ((u_offset_t
)((uintptr_t)addr
& MAXBOFFSET
));
1535 for (; addr
< eaddr
; addr
+= PAGESIZE
, off
+= PAGESIZE
) {
1536 bitmask
= SMAP_BIT_MASK((int)(off
- smp
->sm_off
) >> PAGESHIFT
);
1539 * Large Files: Following assertion is to verify
1540 * the correctness of the cast to (int) above.
1542 ASSERT((u_offset_t
)(off
- smp
->sm_off
) <= INT_MAX
);
1545 * If the bit corresponding to "off" is set,
1546 * clear this bit in the bitmap, unlock translations,
1547 * and release the "exclusive" lock on the page.
1549 if (smp
->sm_bitmap
& bitmask
) {
1551 smp
->sm_bitmap
&= ~bitmask
;
1554 hat_unlock(kas
.a_hat
, addr
, PAGESIZE
);
1557 * Use page_find() instead of page_lookup() to
1558 * find the page since we know that it has
1561 pp
= page_find(vp
, off
);
1563 panic("segmap_pageunlock: page not found");
1566 if (rw
== S_WRITE
) {
1568 } else if (rw
!= S_OTHER
) {
1578 segmap_getmap(struct seg
*seg
, struct vnode
*vp
, u_offset_t off
)
1580 return (segmap_getmapflt(seg
, vp
, off
, MAXBSIZE
, 0, S_OTHER
));
/*
 * This is the magic virtual address that offset 0 of an ELF
 * file gets mapped to in user space.  This is used to pick
 * the vac color on the freelist.
 */
#define	ELF_OFFZERO_VA	(0x10000)

/*
 * segmap_getmap allocates a MAXBSIZE big slot to map the vnode vp
 * in the range <off, off + len).  off doesn't need to be MAXBSIZE aligned.
 * The return address is always MAXBSIZE aligned.
 *
 * If forcefault is nonzero and the MMU translations haven't yet been created,
 * segmap_getmap will call segmap_fault(..., F_INVAL, rw) to create them.
 */
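/*
 * For illustration, a typical file system read path uses this interface
 * roughly as follows (a sketch of the usual caller pattern, not code from
 * this file; the uio handling belongs to the caller):
 *
 *	base = segmap_getmapflt(segkmap, vp, off, n, 1, S_READ);
 *	error = uiomove(base + (off & MAXBOFFSET), n, UIO_READ, uio);
 *	(void) segmap_release(segkmap, base, error ? 0 : SM_DONTNEED);
 */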
1606 struct smap
*smp
, *nsmp
;
1607 extern struct vnode
*common_specvp();
1608 caddr_t baseaddr
; /* MAXBSIZE aligned */
1613 kmutex_t
*hashmtx
, *smapmtx
;
1619 page_t
*pl
[MAXPPB
+ 1];
1623 ASSERT(seg
->s_as
== &kas
);
1624 ASSERT(seg
== segkmap
);
1626 baseoff
= off
& (offset_t
)MAXBMASK
;
1627 if (off
+ len
> baseoff
+ MAXBSIZE
) {
1628 panic("segmap_getmap bad len");
1633 * If this is a block device we have to be sure to use the
1634 * "common" block device vnode for the mapping.
1636 if (vp
->v_type
== VBLK
)
1637 vp
= common_specvp(vp
);
1639 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_getmap
++;
1641 if (segmap_kpm
== 0 ||
1642 (forcefault
== SM_PAGECREATE
&& rw
!= S_WRITE
)) {
1646 SMAP_HASHFUNC(vp
, off
, hashid
); /* macro assigns hashid */
1647 hashmtx
= SHASHMTX(hashid
);
1650 mutex_enter(hashmtx
);
1651 for (smp
= smd_hash
[hashid
].sh_hash_list
;
1652 smp
!= NULL
; smp
= smp
->sm_hash
)
1653 if (smp
->sm_vp
== vp
&& smp
->sm_off
== baseoff
)
1655 mutex_exit(hashmtx
);
1660 ASSERT(vp
->v_count
!= 0);
1663 * Get smap lock and recheck its tag. The hash lock
1664 * is dropped since the hash is based on (vp, off)
1665 * and (vp, off) won't change when we have smap mtx.
1667 smapmtx
= SMAPMTX(smp
);
1668 mutex_enter(smapmtx
);
1669 if (smp
->sm_vp
!= vp
|| smp
->sm_off
!= baseoff
) {
1670 mutex_exit(smapmtx
);
1674 if (smp
->sm_refcnt
== 0) {
1676 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_get_reclaim
++;
1679 * Could still be on the free list. However, this
1680 * could also be an smp that is transitioning from
1681 * the free list when we have too much contention
1682 * for the smapmtx's. In this case, we have an
1683 * unlocked smp that is not on the free list any
1684 * longer, but still has a 0 refcnt. The only way
1685 * to be sure is to check the freelist pointers.
1686 * Since we now have the smapmtx, we are guaranteed
1687 * that the (vp, off) won't change, so we are safe
1688 * to reclaim it. get_free_smp() knows that this
1689 * can happen, and it will check the refcnt.
1692 if ((smp
->sm_next
!= NULL
)) {
1693 struct sm_freeq
*freeq
;
1695 ASSERT(smp
->sm_prev
!= NULL
);
1696 sm
= &smd_free
[smp
->sm_free_ndx
];
1698 if (smp
->sm_flags
& SM_QNDX_ZERO
)
1699 freeq
= &sm
->sm_freeq
[0];
1701 freeq
= &sm
->sm_freeq
[1];
1703 mutex_enter(&freeq
->smq_mtx
);
1704 if (freeq
->smq_free
!= smp
) {
1706 * fastpath normal case
1708 smp
->sm_prev
->sm_next
= smp
->sm_next
;
1709 smp
->sm_next
->sm_prev
= smp
->sm_prev
;
1710 } else if (smp
== smp
->sm_next
) {
1712 * Taking the last smap on freelist
1714 freeq
->smq_free
= NULL
;
1717 * Reclaiming 1st smap on list
1719 freeq
->smq_free
= smp
->sm_next
;
1720 smp
->sm_prev
->sm_next
= smp
->sm_next
;
1721 smp
->sm_next
->sm_prev
= smp
->sm_prev
;
1723 mutex_exit(&freeq
->smq_mtx
);
1724 smp
->sm_prev
= smp
->sm_next
= NULL
;
1726 ASSERT(smp
->sm_prev
== NULL
);
1727 segmapcnt
.smp_stolen
.value
.ul
++;
1731 segmapcnt
.smp_get_use
.value
.ul
++;
1733 smp
->sm_refcnt
++; /* another user */
1736 * We don't invoke segmap_fault via TLB miss, so we set ref
1737 * and mod bits in advance. For S_OTHER we set them in
1738 * segmap_fault F_SOFTUNLOCK.
1741 if (rw
== S_WRITE
) {
1742 smp
->sm_flags
|= SM_WRITE_DATA
;
1743 } else if (rw
== S_READ
) {
1744 smp
->sm_flags
|= SM_READ_DATA
;
1747 mutex_exit(smapmtx
);
1752 uint32_t free_ndx
, *free_ndxp
;
1753 union segmap_cpu
*scpu
;
		/*
		 * On a PAC machine or a machine with anti-alias
		 * hardware, smd_colormsk will be zero.
		 *
		 * On a VAC machine, pick color by offset in the file
		 * so we won't get VAC conflicts on elf files.
		 * On data files, color does not matter but we
		 * don't know what kind of file it is so we always
		 * pick color by offset.  This causes the color
		 * corresponding to file offset zero to be used more
		 * often.
		 */
1767 color
= (baseoff
>> MAXBSHIFT
) & smd_colormsk
;
1768 scpu
= smd_cpu
+CPU
->cpu_seqid
;
1769 free_ndxp
= &scpu
->scpu
.scpu_free_ndx
[color
];
1770 free_ndx
= (*free_ndxp
+= smd_ncolor
) & smd_freemsk
;
1772 colors_used
[free_ndx
]++;
1776 * Get a locked smp slot from the free list.
1778 smp
= get_free_smp(free_ndx
);
1779 smapmtx
= SMAPMTX(smp
);
1781 ASSERT(smp
->sm_vp
== NULL
);
1783 if ((nsmp
= segmap_hashin(smp
, vp
, baseoff
, hashid
)) != NULL
) {
1785 * Failed to hashin, there exists one now.
1786 * Return the smp we just allocated.
1788 segmap_smapadd(smp
);
1789 mutex_exit(smapmtx
);
1794 smp
->sm_refcnt
++; /* another user */
1797 * We don't invoke segmap_fault via TLB miss, so we set ref
1798 * and mod bits in advance. For S_OTHER we set them in
1799 * segmap_fault F_SOFTUNLOCK.
1802 if (rw
== S_WRITE
) {
1803 smp
->sm_flags
|= SM_WRITE_DATA
;
1804 } else if (rw
== S_READ
) {
1805 smp
->sm_flags
|= SM_READ_DATA
;
1808 mutex_exit(smapmtx
);
1814 goto use_segmap_range
;
1819 /* Lint directive required until 6746211 is fixed */
1821 ASSERT(PAGESIZE
== MAXBSIZE
);
1824 * remember the last smp faulted on this cpu.
1826 (smd_cpu
+CPU
->cpu_seqid
)->scpu
.scpu_last_smap
= smp
;
1828 if (forcefault
== SM_PAGECREATE
) {
1829 baseaddr
= segmap_pagecreate_kpm(seg
, vp
, baseoff
, smp
, rw
);
1834 (pp
= GET_KPME(smp
)->kpe_page
) != NULL
) {
1840 if (page_trylock(pp
, SE_SHARED
)) {
1841 if (PP_ISFREE(pp
) ||
1842 !(pp
->p_vnode
== vp
&&
1843 pp
->p_offset
== baseoff
)) {
1845 pp
= page_lookup(vp
, baseoff
,
1849 pp
= page_lookup(vp
, baseoff
, SE_SHARED
);
1853 ASSERT(GET_KPME(smp
)->kpe_page
== NULL
);
1857 if (rw
== S_WRITE
&&
1858 hat_page_getattr(pp
, P_MOD
| P_REF
) !=
1865 * We have the p_selock as reader, grab_smp
1866 * can't hit us, we have bumped the smap
1867 * refcnt and hat_pageunload needs the
1868 * p_selock exclusive.
1870 kpme
= GET_KPME(smp
);
1871 if (kpme
->kpe_page
== pp
) {
1872 baseaddr
= hat_kpm_page2va(pp
, 0);
1873 } else if (kpme
->kpe_page
== NULL
) {
1874 baseaddr
= hat_kpm_mapin(pp
, kpme
);
1876 panic("segmap_getmapflt: stale "
1877 "kpme page, kpme %p", (void *)kpme
);
1882 * We don't invoke segmap_fault via TLB miss,
1883 * so we set ref and mod bits in advance.
1884 * For S_OTHER and we set them in segmap_fault
1887 if (rw
== S_READ
&& !hat_isref(pp
))
1896 base
= segkpm_create_va(baseoff
);
1897 error
= VOP_GETPAGE(vp
, (offset_t
)baseoff
, len
, &prot
, pl
, MAXBSIZE
,
1898 seg
, base
, rw
, CRED(), NULL
);
1901 if (error
|| pp
== NULL
) {
1903 * Use segmap address slot and let segmap_fault deal
1904 * with the error cases. There is no error return
1907 goto use_segmap_range
;
	ASSERT(pl[1] == NULL);

	/*
	 * When prot is not returned w/ PROT_ALL the returned pages
	 * are not backed by fs blocks.  For most of the segmap users
	 * this is no problem, they don't write to the pages in the
	 * same request and therefore don't rely on a following
	 * trap driven segmap_fault.  With SM_LOCKPROTO users it
	 * is more secure to use segkmap addresses to allow
	 * protection segmap_fault's.
	 */
1921 if (prot
!= PROT_ALL
&& forcefault
== SM_LOCKPROTO
) {
1923 * Use segmap address slot and let segmap_fault
1924 * do the error return.
1926 ASSERT(rw
!= S_WRITE
);
1927 ASSERT(PAGE_LOCKED(pp
));
1930 goto use_segmap_range
;
1934 * We have the p_selock as reader, grab_smp can't hit us, we
1935 * have bumped the smap refcnt and hat_pageunload needs the
1936 * p_selock exclusive.
1938 kpme
= GET_KPME(smp
);
1939 if (kpme
->kpe_page
== pp
) {
1940 baseaddr
= hat_kpm_page2va(pp
, 0);
1941 } else if (kpme
->kpe_page
== NULL
) {
1942 baseaddr
= hat_kpm_mapin(pp
, kpme
);
1944 panic("segmap_getmapflt: stale kpme page after "
1945 "VOP_GETPAGE, kpme %p", (void *)kpme
);
1949 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_fault
++;
1955 baseaddr
= seg
->s_base
+ ((smp
- smd_smap
) * MAXBSIZE
);
1956 TRACE_4(TR_FAC_VM
, TR_SEGMAP_GETMAP
,
1957 "segmap_getmap:seg %p addr %p vp %p offset %llx",
1958 seg
, baseaddr
, vp
, baseoff
);
1961 * Prefault the translations
1963 vaddr
= baseaddr
+ (off
- baseoff
);
1964 if (forcefault
&& (newslot
|| !hat_probe(kas
.a_hat
, vaddr
))) {
1966 caddr_t pgaddr
= (caddr_t
)((uintptr_t)vaddr
&
1967 (uintptr_t)PAGEMASK
);
1969 (void) segmap_fault(kas
.a_hat
, seg
, pgaddr
,
1970 (vaddr
+ len
- pgaddr
+ PAGESIZE
- 1) & (uintptr_t)PAGEMASK
,
1978 segmap_release(struct seg
*seg
, caddr_t addr
, uint_t flags
)
1989 if (segmap_kpm
&& IS_KPM_ADDR(addr
)) {
1991 if (((uintptr_t)addr
& MAXBOFFSET
) != 0) {
1992 panic("segmap_release: addr %p not "
1993 "MAXBSIZE aligned", (void *)addr
);
1997 if ((smp
= get_smap_kpm(addr
, &pp
)) == NULL
) {
1998 panic("segmap_release: smap not found "
1999 "for addr %p", (void *)addr
);
2003 TRACE_3(TR_FAC_VM
, TR_SEGMAP_RELMAP
,
2004 "segmap_relmap:seg %p addr %p smp %p",
2007 smtx
= SMAPMTX(smp
);
2010 * For compatibility reasons segmap_pagecreate_kpm sets this
2011 * flag to allow a following segmap_pagecreate to return
2012 * this as "newpage" flag. When segmap_pagecreate is not
2013 * called at all we clear it now.
2015 smp
->sm_flags
&= ~SM_KPM_NEWPAGE
;
2017 if (smp
->sm_flags
& SM_WRITE_DATA
) {
2019 } else if (smp
->sm_flags
& SM_READ_DATA
) {
2023 if (addr
< seg
->s_base
|| addr
>= seg
->s_base
+ seg
->s_size
||
2024 ((uintptr_t)addr
& MAXBOFFSET
) != 0) {
2025 panic("segmap_release: bad addr %p", (void *)addr
);
2028 smp
= GET_SMAP(seg
, addr
);
2030 TRACE_3(TR_FAC_VM
, TR_SEGMAP_RELMAP
,
2031 "segmap_relmap:seg %p addr %p smp %p",
2034 smtx
= SMAPMTX(smp
);
2036 smp
->sm_flags
|= SM_NOTKPM_RELEASED
;
2039 ASSERT(smp
->sm_refcnt
> 0);
2042 * Need to call VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2045 if ((flags
& ~SM_DONTNEED
) != 0) {
2046 if (flags
& SM_WRITE
)
2047 segmapcnt
.smp_rel_write
.value
.ul
++;
2048 if (flags
& SM_ASYNC
) {
2050 segmapcnt
.smp_rel_async
.value
.ul
++;
2052 if (flags
& SM_INVAL
) {
2054 segmapcnt
.smp_rel_abort
.value
.ul
++;
2056 if (flags
& SM_DESTROY
) {
2057 bflags
|= (B_INVAL
|B_TRUNC
);
2058 segmapcnt
.smp_rel_abort
.value
.ul
++;
2060 if (smp
->sm_refcnt
== 1) {
2062 * We only bother doing the FREE and DONTNEED flags
2063 * if no one else is still referencing this mapping.
2065 if (flags
& SM_FREE
) {
2067 segmapcnt
.smp_rel_free
.value
.ul
++;
2069 if (flags
& SM_DONTNEED
) {
2070 bflags
|= B_DONTNEED
;
2071 segmapcnt
.smp_rel_dontneed
.value
.ul
++;
2075 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_release
++;
2079 offset
= smp
->sm_off
;
2081 if (--smp
->sm_refcnt
== 0) {
2083 smp
->sm_flags
&= ~(SM_WRITE_DATA
| SM_READ_DATA
);
2085 if (flags
& (SM_INVAL
|SM_DESTROY
)) {
2086 segmap_hashout(smp
); /* remove map info */
2088 hat_kpm_mapout(pp
, GET_KPME(smp
), addr
);
2089 if (smp
->sm_flags
& SM_NOTKPM_RELEASED
) {
2090 smp
->sm_flags
&= ~SM_NOTKPM_RELEASED
;
2091 hat_unload(kas
.a_hat
, segkmap
->s_base
+
2092 ((smp
- smd_smap
) * MAXBSIZE
),
2093 MAXBSIZE
, HAT_UNLOAD
);
2098 segkpm_mapout_validkpme(GET_KPME(smp
));
2100 smp
->sm_flags
&= ~SM_NOTKPM_RELEASED
;
2101 hat_unload(kas
.a_hat
, addr
, MAXBSIZE
,
2105 segmap_smapadd(smp
); /* add to free list */
2113 * Now invoke VOP_PUTPAGE() if any flags (except SM_DONTNEED)
2116 if ((flags
& ~SM_DONTNEED
) != 0) {
2117 error
= VOP_PUTPAGE(vp
, offset
, MAXBSIZE
,
2118 bflags
, CRED(), NULL
);
/*
 * Dump the pages belonging to this segmap segment.
 */
static void
segmap_dump(struct seg *seg)
{
2132 struct segmap_data
*smd
;
2133 struct smap
*smp
, *smp_end
;
2139 smd
= (struct segmap_data
*)seg
->s_data
;
2141 for (smp
= smd
->smd_sm
, smp_end
= smp
+ smd
->smd_npages
;
2142 smp
< smp_end
; smp
++) {
2144 if (smp
->sm_refcnt
) {
2145 for (off
= 0; off
< MAXBSIZE
; off
+= PAGESIZE
) {
2149 * If pp == NULL, the page either does
2150 * not exist or is exclusively locked.
2151 * So determine if it exists before
2154 if ((pp
= page_lookup_nowait(smp
->sm_vp
,
2155 smp
->sm_off
+ off
, SE_SHARED
)))
2158 pp
= page_exists(smp
->sm_vp
,
2162 pfn
= page_pptonum(pp
);
2163 dump_addpage(seg
->s_as
,
2168 dump_timeleft
= dump_timeout
;
static int
segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw)
{
	return (ENOTSUP);
}
static int
segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;

	memidp->val[0] = (uintptr_t)smd->smd_sm->sm_vp;
	memidp->val[1] = smd->smd_sm->sm_off + (uintptr_t)(addr - seg->s_base);
	return (0);
}
static lgrp_mem_policy_info_t *
segmap_getpolicy(struct seg *seg, caddr_t addr)
{
	return (NULL);
}
static int
segmap_capable(struct seg *seg, segcapability_t capability)
{
	return (0);
}
#ifdef	SEGKPM_SUPPORT

/*
 * segkpm support routines
 */
static caddr_t
segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
	struct smap *smp, enum seg_rw rw)
{
2223 ASSERT(smp
->sm_refcnt
> 0);
2225 if ((pp
= page_lookup(vp
, off
, SE_SHARED
)) == NULL
) {
2228 base
= segkpm_create_va(off
);
2230 if ((pp
= page_create_va(vp
, off
, PAGESIZE
, PG_WAIT
,
2231 seg
, base
)) == NULL
) {
2232 panic("segmap_pagecreate_kpm: "
2233 "page_create failed");
2239 ASSERT((u_offset_t
)(off
- smp
->sm_off
) <= INT_MAX
);
2242 * Mark this here until the following segmap_pagecreate
2243 * or segmap_release.
2245 smtx
= SMAPMTX(smp
);
2247 smp
->sm_flags
|= SM_KPM_NEWPAGE
;
2251 kpme
= GET_KPME(smp
);
2252 if (!newpage
&& kpme
->kpe_page
== pp
)
2253 base
= hat_kpm_page2va(pp
, 0);
2255 base
= hat_kpm_mapin(pp
, kpme
);
2258 * FS code may decide not to call segmap_pagecreate and we
2259 * don't invoke segmap_fault via TLB miss, so we have to set
2260 * ref and mod bits in advance.
2262 if (rw
== S_WRITE
) {
2265 ASSERT(rw
== S_READ
);
2269 smd_cpu
[CPU
->cpu_seqid
].scpu
.scpu_pagecreate
++;
/*
 * Find the smap structure corresponding to the
 * KPM addr and return it locked.
 */
struct smap *
get_smap_kpm(caddr_t addr, page_t **ppp)
{
2284 caddr_t baseaddr
= (caddr_t
)((uintptr_t)addr
& MAXBMASK
);
2288 union segmap_cpu
*scpu
;
2290 pp
= hat_kpm_vaddr2page(baseaddr
);
2292 ASSERT(pp
&& !PP_ISFREE(pp
));
2293 ASSERT(PAGE_LOCKED(pp
));
2294 ASSERT(((uintptr_t)pp
->p_offset
& MAXBOFFSET
) == 0);
2297 offset
= pp
->p_offset
;
2301 * Assume the last smap used on this cpu is the one needed.
2303 scpu
= smd_cpu
+CPU
->cpu_seqid
;
2304 smp
= scpu
->scpu
.scpu_last_smap
;
2305 mutex_enter(&smp
->sm_mtx
);
2306 if (smp
->sm_vp
== vp
&& smp
->sm_off
== offset
) {
2307 ASSERT(smp
->sm_refcnt
> 0);
2310 * Assumption wrong, find the smap on the hash chain.
2312 mutex_exit(&smp
->sm_mtx
);
2313 SMAP_HASHFUNC(vp
, offset
, hashid
); /* macro assigns hashid */
2314 hashmtx
= SHASHMTX(hashid
);
2316 mutex_enter(hashmtx
);
2317 smp
= smd_hash
[hashid
].sh_hash_list
;
2318 for (; smp
!= NULL
; smp
= smp
->sm_hash
) {
2319 if (smp
->sm_vp
== vp
&& smp
->sm_off
== offset
)
2322 mutex_exit(hashmtx
);
2324 mutex_enter(&smp
->sm_mtx
);
2325 ASSERT(smp
->sm_vp
== vp
&& smp
->sm_off
== offset
);
2330 *ppp
= smp
? pp
: NULL
;
#else	/* SEGKPM_SUPPORT */

static caddr_t
segmap_pagecreate_kpm(struct seg *seg, vnode_t *vp, u_offset_t off,
	struct smap *smp, enum seg_rw rw)
{
	return (NULL);
}

struct smap *
get_smap_kpm(caddr_t addr, page_t **ppp)
{
	return (NULL);
}

#endif	/* SEGKPM_SUPPORT */