usr/src/uts/common/vm/seg_spt.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2019 Joyent, Inc.
  24  * Copyright (c) 2016 by Delphix. All rights reserved.
  25  */
  26
  27 #include <sys/param.h>
  28 #include <sys/user.h>
  29 #include <sys/mman.h>
  30 #include <sys/kmem.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/systm.h>
  34 #include <sys/tuneable.h>
  35 #include <vm/hat.h>
  36 #include <vm/seg.h>
  37 #include <vm/as.h>
  38 #include <vm/anon.h>
  39 #include <vm/page.h>
  40 #include <sys/buf.h>
  41 #include <sys/swap.h>
  42 #include <sys/atomic.h>
  43 #include <vm/seg_spt.h>
  44 #include <sys/debug.h>
  45 #include <sys/vtrace.h>
  46 #include <sys/shm.h>
  47 #include <sys/shm_impl.h>
  48 #include <sys/lgrp.h>
  49 #include <sys/vmsystm.h>
  50 #include <sys/policy.h>
  51 #include <sys/project.h>
  52 #include <sys/zone.h>
  53
/*
 * Base address of the spt segment inside its own address space; used as the
 * mapping address by sptcreate() and as the unmapping address by
 * sptdestroy().
 */
#define SEGSPTADDR      (caddr_t)0x0

/*
 * # pages used for spt.  segspt_create() atomically adds the segment's page
 * count here once a non-pageable segment has been fully set up.
 */
size_t  spt_used;

/*
 * See spt_setminfree().
 *
 * segspt_minfree is a page count; while it is still 0, sptcreate() calls
 * spt_setminfree() to compute it.  segspt_minfree_clamp is an upper bound
 * expressed in bytes; setting it to 0 disables the clamp.
 */
pgcnt_t segspt_minfree = 0;
size_t segspt_minfree_clamp = (1UL << 30); /* 1GB in bytes */

/*
 * Forward declarations for the spt segment operations that are actually
 * implemented (the remaining segspt_ops entries are the segspt_badop_*
 * routines below).
 */
static int segspt_create(struct seg **segpp, void *argsp);
static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_free(struct seg *seg);
static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
  72
  73 /* ARGSUSED */
  74 __NORETURN static int
  75 segspt_badop_dup(struct seg *seg __unused, struct seg *newseg __unused)
  76 {
  77         panic("%s called", __func__);
  78 }
  79
  80 /* ARGSUSED */
  81 __NORETURN static faultcode_t
  82 segspt_badop_fault(struct hat *hat, struct seg *seg, caddr_t addr,
  83     size_t len, enum fault_type type, enum seg_rw rw)
  84 {
  85         panic("%s called", __func__);
  86 }
  87
  88 /* ARGSUSED */
  89 __NORETURN static faultcode_t
  90 segspt_badop_faulta(struct seg *seg __unused, caddr_t addr __unused)
  91 {
  92         panic("%s called", __func__);
  93 }
  94
  95 /* ARGSUSED */
  96 __NORETURN static int
  97 segspt_badop_prot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
  98 {
  99         panic("%s called", __func__);
 100 }
 101
 102 /* ARGSUSED */
 103 __NORETURN static int
 104 segspt_badop_checkprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
 105 {
 106         panic("%s called", __func__);
 107 }
 108
 109 /* ARGSUSED */
 110 __NORETURN static int
 111 segspt_badop_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
 112 {
 113         panic("%s called", __func__);
 114 }
 115
 116 /* ARGSUSED */
 117 __NORETURN static size_t
 118 segspt_badop_swapout(struct seg *seg)
 119 {
 120         panic("%s called", __func__);
 121 }
 122
 123 /* ARGSUSED */
 124 __NORETURN static int
 125 segspt_badop_sync(struct seg *seg, caddr_t addr, size_t len, int attr,
 126     uint_t flags)
 127 {
 128         panic("%s called", __func__);
 129 }
 130
 131 /* ARGSUSED */
 132 __NORETURN
 133 static size_t
 134 segspt_badop_incore(struct seg *seg, caddr_t addr, size_t len, char *vec)
 135 {
 136         panic("%s called", __func__);
 137 }
 138
 139 /* ARGSUSED */
 140 __NORETURN static int
 141 segspt_badop_lockop(struct seg *seg, caddr_t addr, size_t len, int attr,
 142     int op, ulong_t *lockmap, size_t pos)
 143 {
 144         panic("%s called", __func__);
 145 }
 146
 147 /* ARGSUSED */
 148 __NORETURN static int
 149 segspt_badop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
 150 {
 151         panic("%s called", __func__);
 152 }
 153
 154 /* ARGSUSED */
 155 __NORETURN static u_offset_t
 156 segspt_badop_getoffset(struct seg *seg, caddr_t addr)
 157 {
 158         panic("%s called", __func__);
 159 }
 160
 161 /* ARGSUSED */
 162 __NORETURN static int
 163 segspt_badop_gettype(struct seg *seg, caddr_t addr)
 164 {
 165         panic("%s called", __func__);
 166 }
 167
 168 /* ARGSUSED */
 169 __NORETURN static int
 170 segspt_badop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
 171 {
 172         panic("%s called", __func__);
 173 }
 174
 175 /* ARGSUSED */
 176 __NORETURN static int
 177 segspt_badop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
 178 {
 179         panic("%s called", __func__);
 180 }
 181
 182 /* ARGSUSED */
 183 __NORETURN static void
 184 segspt_badop_dump(struct seg *seg)
 185 {
 186         panic("%s called", __func__);
 187 }
 188
 189 /* ARGSUSED */
 190 __NORETURN static int
 191 segspt_badop_pagelock(struct seg *seg, caddr_t addr, size_t len,
 192     struct page ***ppp, enum lock_type type, enum seg_rw rw)
 193 {
 194         panic("%s called", __func__);
 195 }
 196
 197 /* ARGSUSED */
 198 __NORETURN static int
 199 segspt_badop_setpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
 200 {
 201         panic("%s called", __func__);
 202 }
 203
 204 /* ARGSUSED */
 205 __NORETURN static int
 206 segspt_badop_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
 207 {
 208         panic("%s called", __func__);
 209 }
 210
 211 /* ARGSUSED */
 212 __NORETURN static int
 213 segspt_badop_capable(struct seg *seg, segcapability_t capability)
 214 {
 215         panic("%s called", __func__);
 216 }
 217
/*
 * Operations vector installed on the spt segment by segspt_create().
 * Only unmap, free and getpolicy have real implementations; inherit uses
 * seg_inherit_notsup and every other slot points at a segspt_badop_*
 * routine that panics if it is ever called.  The initializer is positional,
 * so the order of the entries must not be changed.
 */
struct seg_ops segspt_ops = {
        segspt_badop_dup,               /* dup */
        segspt_unmap,                   /* unmap */
        segspt_free,                    /* free */
        segspt_badop_fault,             /* fault */
        segspt_badop_faulta,            /* faulta */
        segspt_badop_prot,              /* setprot */
        segspt_badop_checkprot,         /* checkprot */
        segspt_badop_kluster,           /* kluster */
        segspt_badop_swapout,           /* swapout */
        segspt_badop_sync,              /* sync */
        segspt_badop_incore,            /* incore */
        segspt_badop_lockop,            /* lockop */
        segspt_badop_getprot,           /* getprot */
        segspt_badop_getoffset,         /* getoffset */
        segspt_badop_gettype,           /* gettype */
        segspt_badop_getvp,             /* getvp */
        segspt_badop_advise,            /* advise */
        segspt_badop_dump,              /* dump */
        segspt_badop_pagelock,          /* pagelock */
        segspt_badop_setpgsz,           /* setpgsz */
        segspt_badop_getmemid,          /* getmemid */
        segspt_getpolicy,               /* getpolicy */
        segspt_badop_capable,           /* capable */
        seg_inherit_notsup              /* inherit */
};
 244
/*
 * Forward declarations of the segspt_shm* routines that populate the
 * segspt_shmops operations vector, listed in the same order as the slots of
 * that vector.
 */
static int segspt_shmdup(struct seg *seg, struct seg *newseg);
static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
static void segspt_shmfree(struct seg *seg);
static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
                caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
static int segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len,
                uint_t prot);
static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
                uint_t prot);
static int      segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t   segspt_shmswapout(struct seg *seg);
static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
                char *vec);
static int segspt_shmsync(struct seg *seg, caddr_t addr, size_t len,
                int attr, uint_t flags);
static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
                int attr, int op, ulong_t *lockmap, size_t pos);
static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
                uint_t *protv);
static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
static int segspt_shmgettype(struct seg *seg, caddr_t addr);
static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
                uint_t behav);
static void segspt_shmdump(struct seg *seg);
static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
                struct page ***, enum lock_type, enum seg_rw);
static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
static int segspt_shmcapable(struct seg *, segcapability_t);
 277
/*
 * Operations vector built from the segspt_shm* routines.  The initializer
 * is positional and uses the same slot order as segspt_ops, so the order of
 * the entries must not be changed.
 */
struct seg_ops segspt_shmops = {
        segspt_shmdup,          /* dup */
        segspt_shmunmap,        /* unmap */
        segspt_shmfree,         /* free */
        segspt_shmfault,        /* fault */
        segspt_shmfaulta,       /* faulta */
        segspt_shmsetprot,      /* setprot */
        segspt_shmcheckprot,    /* checkprot */
        segspt_shmkluster,      /* kluster */
        segspt_shmswapout,      /* swapout */
        segspt_shmsync,         /* sync */
        segspt_shmincore,       /* incore */
        segspt_shmlockop,       /* lockop */
        segspt_shmgetprot,      /* getprot */
        segspt_shmgetoffset,    /* getoffset */
        segspt_shmgettype,      /* gettype */
        segspt_shmgetvp,        /* getvp */
        segspt_shmadvise,       /* advise */
        segspt_shmdump,         /* dump */
        segspt_shmpagelock,     /* pagelock */
        segspt_shmsetpgsz,      /* setpgsz */
        segspt_shmgetmemid,     /* getmemid */
        segspt_shmgetpolicy,    /* getpolicy */
        segspt_shmcapable,      /* capable */
        seg_inherit_notsup      /* inherit */
};
 304
/*
 * Forward declarations of file-local helpers whose definitions appear later
 * in this file.
 */
static void segspt_purge(struct seg *seg);
static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
                enum seg_rw, int);
static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
                page_t **ppa);
 310
 311 /*
 312  * This value corresponds to headroom in availrmem that ISM can never allocate
 313  * (but others can).  The original intent here was to prevent ISM from locking
 314  * all of the remaining availrmem into memory, making forward progress
 315  * difficult. It's not clear how much this matters on modern systems.
 316  *
 317  * The traditional default value of 5% of total memory is used, except on
 318  * systems where that quickly gets ridiculous: in that case we clamp at a rather
 319  * arbitrary value of 1GB.
 320  *
 321  * Note that since this is called lazily on the first sptcreate(), in theory,
 322  * this could represent a very small value if the system is heavily loaded
 323  * already. In practice, the first ISM user is pretty likely to come along
 324  * earlier during the system's operation.
 325  *
 326  * This never gets re-figured.
 327  */
 328 static void
 329 spt_setminfree(void)
 330 {
 331         segspt_minfree = availrmem / 20;
 332
 333         if (segspt_minfree_clamp != 0 &&
 334             segspt_minfree > (segspt_minfree_clamp / PAGESIZE))
 335                 segspt_minfree = segspt_minfree_clamp / PAGESIZE;
 336 }
 337
 338 int
 339 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
 340     uint_t prot, uint_t flags, uint_t share_szc)
 341 {
 342         int     err;
 343         struct  as      *newas;
 344         struct  segspt_crargs sptcargs;
 345
 346         if (segspt_minfree == 0)
 347                 spt_setminfree();
 348
 349         if (!hat_supported(HAT_SHARED_PT, (void *)0))
 350                 return (EINVAL);
 351
 352         /*
 353          * get a new as for this shared memory segment
 354          */
 355         newas = as_alloc();
 356         newas->a_proc = NULL;
 357         sptcargs.amp = amp;
 358         sptcargs.prot = prot;
 359         sptcargs.flags = flags;
 360         sptcargs.szc = share_szc;
 361         /*
 362          * create a shared page table (spt) segment
 363          */
 364
 365         if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
 366                 as_free(newas);
 367                 return (err);
 368         }
 369         *sptseg = sptcargs.seg_spt;
 370         return (0);
 371 }
 372
 373 void
 374 sptdestroy(struct as *as, struct anon_map *amp)
 375 {
 376
 377         (void) as_unmap(as, SEGSPTADDR, amp->size);
 378         as_free(as);
 379 }
 380
 381 /*
 382  * called from seg_free().
 383  * free (i.e., unlock, unmap, return to free list)
 384  *  all the pages in the given seg.
 385  */
 386 void
 387 segspt_free(struct seg  *seg)
 388 {
 389         struct spt_data *sptd = (struct spt_data *)seg->s_data;
 390
 391         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
 392
 393         if (sptd != NULL) {
 394                 if (sptd->spt_realsize)
 395                         segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
 396
 397                 if (sptd->spt_ppa_lckcnt) {
 398                         kmem_free(sptd->spt_ppa_lckcnt,
 399                             sizeof (*sptd->spt_ppa_lckcnt)
 400                             * btopr(sptd->spt_amp->size));
 401                 }
 402                 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
 403                 cv_destroy(&sptd->spt_cv);
 404                 mutex_destroy(&sptd->spt_lock);
 405                 kmem_free(sptd, sizeof (*sptd));
 406         }
 407 }
 408
 409 /*ARGSUSED*/
 410 static int
 411 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
 412     uint_t flags)
 413 {
 414         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
 415
 416         return (0);
 417 }
 418
 419 /*ARGSUSED*/
 420 static size_t
 421 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
 422 {
 423         caddr_t eo_seg;
 424         pgcnt_t npages;
 425         struct shm_data *shmd = (struct shm_data *)seg->s_data;
 426         struct seg      *sptseg;
 427         struct spt_data *sptd;
 428
 429         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
 430 #ifdef lint
 431         seg = seg;
 432 #endif
 433         sptseg = shmd->shm_sptseg;
 434         sptd = sptseg->s_data;
 435
 436         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 437                 eo_seg = addr + len;
 438                 while (addr < eo_seg) {
 439                         /* page exists, and it's locked. */
 440                         *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
 441                             SEG_PAGE_ANON;
 442                         addr += PAGESIZE;
 443                 }
 444                 return (len);
 445         } else {
 446                 struct  anon_map *amp = shmd->shm_amp;
 447                 struct  anon    *ap;
 448                 page_t          *pp;
 449                 pgcnt_t         anon_index;
 450                 struct vnode    *vp;
 451                 u_offset_t      off;
 452                 ulong_t         i;
 453                 int             ret;
 454                 anon_sync_obj_t cookie;
 455
 456                 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 457                 anon_index = seg_page(seg, addr);
 458                 npages = btopr(len);
 459                 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
 460                         return (EINVAL);
 461                 }
 462                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 463                 for (i = 0; i < npages; i++, anon_index++) {
 464                         ret = 0;
 465                         anon_array_enter(amp, anon_index, &cookie);
 466                         ap = anon_get_ptr(amp->ahp, anon_index);
 467                         if (ap != NULL) {
 468                                 swap_xlate(ap, &vp, &off);
 469                                 anon_array_exit(&cookie);
 470                                 pp = page_lookup_nowait(vp, off, SE_SHARED);
 471                                 if (pp != NULL) {
 472                                         ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
 473                                         page_unlock(pp);
 474                                 }
 475                         } else {
 476                                 anon_array_exit(&cookie);
 477                         }
 478                         if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
 479                                 ret |= SEG_PAGE_LOCKED;
 480                         }
 481                         *vec++ = (char)ret;
 482                 }
 483                 ANON_LOCK_EXIT(&amp->a_rwlock);
 484                 return (len);
 485         }
 486 }
 487
 488 static int
 489 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
 490 {
 491         size_t share_size;
 492
 493         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
 494
 495         /*
 496          * seg.s_size may have been rounded up to the largest page size
 497          * in shmat().
 498          * XXX This should be cleanedup. sptdestroy should take a length
 499          * argument which should be the same as sptcreate. Then
 500          * this rounding would not be needed (or is done in shm.c)
 501          * Only the check for full segment will be needed.
 502          *
 503          * XXX -- shouldn't raddr == 0 always? These tests don't seem
 504          * to be useful at all.
 505          */
 506         share_size = page_get_pagesize(seg->s_szc);
 507         ssize = P2ROUNDUP(ssize, share_size);
 508
 509         if (raddr == seg->s_base && ssize == seg->s_size) {
 510                 seg_free(seg);
 511                 return (0);
 512         } else
 513                 return (EINVAL);
 514 }
 515
/*
 * Segment creation callback handed to as_map() by sptcreate().
 *
 * argsp points at a struct segspt_crargs carrying the anon map, protections,
 * flags and page size code for the new spt segment; on success the segment
 * is stored back into its seg_spt member.
 *
 * For a SHM_PAGEABLE segment no pages are created here: the anon array is
 * rounded up to the segment's page size if necessary (reserving the extra
 * swap against the owning zone), the per-page lock-count array is allocated
 * and the function returns.
 *
 * Otherwise swap is adjusted for all pages up front, the pages are created
 * with anon_map_createpages(), locked memory is charged to the project,
 * each page is locked with page_pp_lock(), and translations are loaded one
 * large page at a time.
 *
 * Returns 0 on success.  On failure returns the error from
 * anon_swap_adjust() or anon_map_createpages(), or ENOMEM, after unwinding
 * through the out4..out1 labels whatever had been set up so far.
 */
int
segspt_create(struct seg **segpp, void *argsp)
{
        struct seg      *seg = *segpp;
        int             err;
        caddr_t         addr = seg->s_base;
        struct spt_data *sptd;
        struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
        struct anon_map *amp = sptcargs->amp;
        struct kshmid   *sp = amp->a_sp;
        struct  cred    *cred = CRED();
        ulong_t         i, j, anon_index = 0;
        pgcnt_t         npages = btopr(amp->size);
        struct vnode    *vp;
        page_t          **ppa;
        uint_t          hat_flags;
        size_t          pgsz;
        pgcnt_t         pgcnt;
        caddr_t         a;
        pgcnt_t         pidx;
        size_t          sz;
        proc_t          *procp = curproc;
        rctl_qty_t      lockedbytes = 0;
        kproject_t      *proj;

        /*
         * We are holding the a_lock on the underlying dummy as,
         * so we can make calls to the HAT layer.
         */
        ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
        ASSERT(sp != NULL);

        if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
                if (err = anon_swap_adjust(npages))
                        return (err);
        }
        /* Default error for the KM_NOSLEEP allocation failures below. */
        err = ENOMEM;

        if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
                goto out1;

        /* The page array is only needed when pages are created here. */
        ppa = NULL;
        if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
                if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
                    KM_NOSLEEP)) == NULL)
                        goto out2;
        }

        mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);

        if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
                goto out3;

        seg->s_ops = &segspt_ops;
        sptd->spt_vp = vp;
        sptd->spt_amp = amp;
        sptd->spt_prot = sptcargs->prot;
        sptd->spt_flags = sptcargs->flags;
        seg->s_data = (caddr_t)sptd;
        sptd->spt_ppa = NULL;
        sptd->spt_ppa_lckcnt = NULL;
        seg->s_szc = sptcargs->szc;
        cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
        sptd->spt_gen = 0;

        /* Raise the anon map's page size code to the segment's if needed. */
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
        if (seg->s_szc > amp->a_szc) {
                amp->a_szc = seg->s_szc;
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);

        /*
         * Set policy to affect initial allocation of pages in
         * anon_map_createpages()
         */
        (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
            NULL, 0, ptob(npages));

        if (sptcargs->flags & SHM_PAGEABLE) {
                size_t  share_sz;
                pgcnt_t new_npgs, more_pgs;
                struct anon_hdr *nahp;
                zone_t *zone;

                share_sz = page_get_pagesize(seg->s_szc);
                if (!IS_P2ALIGNED(amp->size, share_sz)) {
                        /*
                         * We are rounding up the size of the anon array
                         * on 4 M boundary because we always create 4 M
                         * of page(s) when locking, faulting pages and we
                         * don't have to check for all corner cases e.g.
                         * if there is enough space to allocate 4 M
                         * page.
                         */
                        new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
                        more_pgs = new_npgs - npages;

                        /*
                         * The zone will never be NULL, as a fully created
                         * shm always has an owning zone.
                         */
                        zone = sp->shm_perm.ipc_zone_ref.zref_zone;
                        ASSERT(zone != NULL);
                        if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
                                err = ENOMEM;
                                goto out4;
                        }

                        /*
                         * Replace the anon header with a larger one holding
                         * the existing slots, and grow the recorded size and
                         * swap reservation to match.
                         */
                        nahp = anon_create(new_npgs, ANON_SLEEP);
                        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
                        (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
                            ANON_SLEEP);
                        anon_release(amp->ahp, npages);
                        amp->ahp = nahp;
                        ASSERT(amp->swresv == ptob(npages));
                        amp->swresv = amp->size = ptob(new_npgs);
                        ANON_LOCK_EXIT(&amp->a_rwlock);
                        npages = new_npgs;
                }

                sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
                    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
                sptd->spt_pcachecnt = 0;
                sptd->spt_realsize = ptob(npages);
                sptcargs->seg_spt = seg;
                return (0);
        }

        /*
         * get array of pages for each anon slot in amp
         */
        if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
            seg, addr, S_CREATE, cred)) != 0)
                goto out4;

        mutex_enter(&sp->shm_mlock);

        /* May be partially locked, so, count bytes to charge for locking */
        for (i = 0; i < npages; i++)
                if (ppa[i]->p_lckcnt == 0)
                        lockedbytes += PAGESIZE;

        proj = sp->shm_perm.ipc_proj;

        if (lockedbytes > 0) {
                mutex_enter(&procp->p_lock);
                if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
                        mutex_exit(&procp->p_lock);
                        mutex_exit(&sp->shm_mlock);
                        for (i = 0; i < npages; i++)
                                page_unlock(ppa[i]);
                        err = ENOMEM;
                        goto out4;
                }
                mutex_exit(&procp->p_lock);
        }

        /*
         * addr is initial address corresponding to the first page on ppa list
         */
        for (i = 0; i < npages; i++) {
                /* attempt to lock all pages */
                if (page_pp_lock(ppa[i], 0, 1) == 0) {
                        /*
                         * if unable to lock any page, unlock all
                         * of them and return error
                         */
                        for (j = 0; j < i; j++)
                                page_pp_unlock(ppa[j], 0, 1);
                        for (i = 0; i < npages; i++)
                                page_unlock(ppa[i]);
                        rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
                        mutex_exit(&sp->shm_mlock);
                        err = ENOMEM;
                        goto out4;
                }
        }
        mutex_exit(&sp->shm_mlock);

        /*
         * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
         * for the entire life of the segment. For example platforms
         * that do not support Dynamic Reconfiguration.
         */
        hat_flags = HAT_LOAD_SHARE;
        if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
                hat_flags |= HAT_LOAD_LOCK;

        /*
         * Load translations one large page at a time
         * to make sure we don't create mappings bigger than
         * segment's size code in case underlying pages
         * are shared with segvn's segment that uses bigger
         * size code than we do.
         */
        pgsz = page_get_pagesize(seg->s_szc);
        pgcnt = page_get_pagecnt(seg->s_szc);
        for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
                /* The final chunk may be shorter than a full large page. */
                sz = MIN(pgsz, ptob(npages - pidx));
                hat_memload_array(seg->s_as->a_hat, a, sz,
                    &ppa[pidx], sptd->spt_prot, hat_flags);
        }

        /*
         * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
         * we will leave the pages locked SE_SHARED for the life
         * of the ISM segment. This will prevent any calls to
         * hat_pageunload() on this ISM segment for those platforms.
         */
        if (!(hat_flags & HAT_LOAD_LOCK)) {
                /*
                 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
                 * we no longer need to hold the SE_SHARED lock on the pages,
                 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
                 * SE_SHARED lock on the pages as necessary.
                 */
                for (i = 0; i < npages; i++)
                        page_unlock(ppa[i]);
        }
        sptd->spt_pcachecnt = 0;
        kmem_free(ppa, ((sizeof (page_t *)) * npages));
        sptd->spt_realsize = ptob(npages);
        atomic_add_long(&spt_used, npages);
        sptcargs->seg_spt = seg;
        return (0);

        /*
         * Error unwinding.  Each label undoes one more step of the setup
         * above and then falls through to the next.
         */
out4:
        /* Detach the private data from the segment; free vnode and cv. */
        seg->s_data = NULL;
        kmem_free(vp, sizeof (*vp));
        cv_destroy(&sptd->spt_cv);
out3:
        mutex_destroy(&sptd->spt_lock);
        if ((sptcargs->flags & SHM_PAGEABLE) == 0)
                kmem_free(ppa, (sizeof (*ppa) * npages));
out2:
        kmem_free(sptd, sizeof (*sptd));
out1:
        /* Undo the anon_swap_adjust() done on entry. */
        if ((sptcargs->flags & SHM_PAGEABLE) == 0)
                anon_swap_restore(npages);
        return (err);
}
 757
 758 /*ARGSUSED*/
 759 void
 760 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
 761 {
 762         struct page     *pp;
 763         struct spt_data *sptd = (struct spt_data *)seg->s_data;
 764         pgcnt_t         npages;
 765         ulong_t         anon_idx;
 766         struct anon_map *amp;
 767         struct anon     *ap;
 768         struct vnode    *vp;
 769         u_offset_t      off;
 770         uint_t          hat_flags;
 771         int             root = 0;
 772         pgcnt_t         pgs, curnpgs = 0;
 773         page_t          *rootpp;
 774         rctl_qty_t      unlocked_bytes = 0;
 775         kproject_t      *proj;
 776         kshmid_t        *sp;
 777
 778         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
 779
 780         len = P2ROUNDUP(len, PAGESIZE);
 781
 782         npages = btop(len);
 783
 784         hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
 785         if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
 786             (sptd->spt_flags & SHM_PAGEABLE)) {
 787                 hat_flags = HAT_UNLOAD_UNMAP;
 788         }
 789
 790         hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
 791
 792         amp = sptd->spt_amp;
 793         if (sptd->spt_flags & SHM_PAGEABLE)
 794                 npages = btop(amp->size);
 795
 796         ASSERT(amp != NULL);
 797
 798         proj = NULL;
 799         rootpp = NULL;
 800         sp = NULL;
 801         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 802                 sp = amp->a_sp;
 803                 proj = sp->shm_perm.ipc_proj;
 804                 mutex_enter(&sp->shm_mlock);
 805         }
 806         for (anon_idx = 0; anon_idx < npages; anon_idx++) {
 807                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 808                         if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
 809                                 panic("segspt_free_pages: null app");
 810                                 /*NOTREACHED*/
 811                         }
 812                 } else {
 813                         if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
 814                             == NULL)
 815                                 continue;
 816                 }
 817                 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
 818                 swap_xlate(ap, &vp, &off);
 819
 820                 /*
 821                  * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
 822                  * the pages won't be having SE_SHARED lock at this
 823                  * point.
 824                  *
 825                  * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
 826                  * the pages are still held SE_SHARED locked from the
 827                  * original segspt_create()
 828                  *
 829                  * Our goal is to get SE_EXCL lock on each page, remove
 830                  * permanent lock on it and invalidate the page.
 831                  */
 832                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 833                         if (hat_flags == HAT_UNLOAD_UNMAP)
 834                                 pp = page_lookup(vp, off, SE_EXCL);
 835                         else {
 836                                 if ((pp = page_find(vp, off)) == NULL) {
 837                                         panic("segspt_free_pages: "
 838                                             "page not locked");
 839                                         /*NOTREACHED*/
 840                                 }
 841                                 if (!page_tryupgrade(pp)) {
 842                                         page_unlock(pp);
 843                                         pp = page_lookup(vp, off, SE_EXCL);
 844                                 }
 845                         }
 846                         if (pp == NULL) {
 847                                 panic("segspt_free_pages: "
 848                                     "page not in the system");
 849                                 /*NOTREACHED*/
 850                         }
 851                         ASSERT(pp->p_lckcnt > 0);
 852                         page_pp_unlock(pp, 0, 1);
 853                         if (pp->p_lckcnt == 0)
 854                                 unlocked_bytes += PAGESIZE;
 855                 } else {
 856                         if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
 857                                 continue;
 858                 }
 859                 /*
 860                  * It's logical to invalidate the pages here as in most cases
 861                  * these were created by segspt.
 862                  */
 863                 if (pp->p_szc != 0) {
 864                         if (root == 0) {
 865                                 ASSERT(curnpgs == 0);
 866                                 root = 1;
 867                                 rootpp = pp;
 868                                 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
 869                                 ASSERT(pgs > 1);
 870                                 ASSERT(IS_P2ALIGNED(pgs, pgs));
 871                                 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
 872                                 curnpgs--;
 873                         } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
 874                                 ASSERT(curnpgs == 1);
 875                                 ASSERT(page_pptonum(pp) ==
 876                                     page_pptonum(rootpp) + (pgs - 1));
 877                                 page_destroy_pages(rootpp);
 878                                 root = 0;
 879                                 curnpgs = 0;
 880                         } else {
 881                                 ASSERT(curnpgs > 1);
 882                                 ASSERT(page_pptonum(pp) ==
 883                                     page_pptonum(rootpp) + (pgs - curnpgs));
 884                                 curnpgs--;
 885                         }
 886                 } else {
 887                         if (root != 0 || curnpgs != 0) {
 888                                 panic("segspt_free_pages: bad large page");
 889                                 /*NOTREACHED*/
 890                         }
 891                         /*
 892                          * Before destroying the pages, we need to take care
 893                          * of the rctl locked memory accounting. For that
 894                          * we need to calculte the unlocked_bytes.
 895                          */
 896                         if (pp->p_lckcnt > 0)
 897                                 unlocked_bytes += PAGESIZE;
 898                         /*LINTED: constant in conditional context */
 899                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 900                 }
 901         }
 902         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 903                 if (unlocked_bytes > 0)
 904                         rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
 905                 mutex_exit(&sp->shm_mlock);
 906         }
 907         if (root != 0 || curnpgs != 0) {
 908                 panic("segspt_free_pages: bad large page");
 909                 /*NOTREACHED*/
 910         }
 911
 912         /*
 913          * mark that pages have been released
 914          */
 915         sptd->spt_realsize = 0;
 916
 917         if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
 918                 atomic_add_long(&spt_used, -npages);
 919                 anon_swap_restore(npages);
 920         }
 921 }
 922
 923 /*
 924  * Get memory allocation policy info for specified address in given segment
 925  */
 926 static lgrp_mem_policy_info_t *
 927 segspt_getpolicy(struct seg *seg, caddr_t addr)
 928 {
 929         struct anon_map         *amp;
 930         ulong_t                 anon_index;
 931         lgrp_mem_policy_info_t  *policy_info;
 932         struct spt_data         *spt_data;
 933
 934         ASSERT(seg != NULL);
 935
 936         /*
 937          * Get anon_map from segspt
 938          *
 939          * Assume that no lock needs to be held on anon_map, since
 940          * it should be protected by its reference count which must be
 941          * nonzero for an existing segment
 942          * Need to grab readers lock on policy tree though
 943          */
 944         spt_data = (struct spt_data *)seg->s_data;
 945         if (spt_data == NULL)
 946                 return (NULL);
 947         amp = spt_data->spt_amp;
 948         ASSERT(amp->refcnt != 0);
 949
 950         /*
 951          * Get policy info
 952          *
 953          * Assume starting anon index of 0
 954          */
 955         anon_index = seg_page(seg, addr);
 956         policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
 957
 958         return (policy_info);
 959 }
 960
 961 /*
 962  * DISM only.
 963  * Return locked pages over a given range.
 964  *
 965  * We will cache all DISM locked pages and save the pplist for the
 966  * entire segment in the ppa field of the underlying DISM segment structure.
 967  * Later, during a call to segspt_reclaim() we will use this ppa array
 968  * to page_unlock() all of the pages and then we will free this ppa list.
 969  */
 970 /*ARGSUSED*/
 971 static int
 972 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
 973     struct page ***ppp, enum lock_type type, enum seg_rw rw)
 974 {
 975         struct  shm_data *shmd = (struct shm_data *)seg->s_data;
 976         struct  seg     *sptseg = shmd->shm_sptseg;
 977         struct  spt_data *sptd = sptseg->s_data;
 978         pgcnt_t pg_idx, npages, tot_npages, npgs;
 979         struct  page **pplist, **pl, **ppa, *pp;
 980         struct  anon_map *amp;
 981         spgcnt_t        an_idx;
 982         int     ret = ENOTSUP;
 983         uint_t  pl_built = 0;
 984         struct  anon *ap;
 985         struct  vnode *vp;
 986         u_offset_t off;
 987         pgcnt_t claim_availrmem = 0;
 988         uint_t  szc;
 989
 990         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
 991         ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
 992
 993         /*
 994          * We want to lock/unlock the entire ISM segment. Therefore,
 995          * we will be using the underlying sptseg and it's base address
 996          * and length for the caching arguments.
 997          */
 998         ASSERT(sptseg);
 999         ASSERT(sptd);
1000
1001         pg_idx = seg_page(seg, addr);
1002         npages = btopr(len);
1003
1004         /*
1005          * check if the request is larger than number of pages covered
1006          * by amp
1007          */
1008         if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
1009                 *ppp = NULL;
1010                 return (ENOTSUP);
1011         }
1012
1013         if (type == L_PAGEUNLOCK) {
1014                 ASSERT(sptd->spt_ppa != NULL);
1015
1016                 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1017                     sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1018
1019                 /*
1020                  * If someone is blocked while unmapping, we purge
1021                  * segment page cache and thus reclaim pplist synchronously
1022                  * without waiting for seg_pasync_thread. This speeds up
1023                  * unmapping in cases where munmap(2) is called, while
1024                  * raw async i/o is still in progress or where a thread
1025                  * exits on data fault in a multithreaded application.
1026                  */
1027                 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
1028                     (AS_ISUNMAPWAIT(seg->s_as) &&
1029                     shmd->shm_softlockcnt > 0)) {
1030                         segspt_purge(seg);
1031                 }
1032                 return (0);
1033         }
1034
1035         /* The L_PAGELOCK case ... */
1036
1037         if (sptd->spt_flags & DISM_PPA_CHANGED) {
1038                 segspt_purge(seg);
1039                 /*
1040                  * for DISM ppa needs to be rebuild since
1041                  * number of locked pages could be changed
1042                  */
1043                 *ppp = NULL;
1044                 return (ENOTSUP);
1045         }
1046
1047         /*
1048          * First try to find pages in segment page cache, without
1049          * holding the segment lock.
1050          */
1051         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1052             S_WRITE, SEGP_FORCE_WIRED);
1053         if (pplist != NULL) {
1054                 ASSERT(sptd->spt_ppa != NULL);
1055                 ASSERT(sptd->spt_ppa == pplist);
1056                 ppa = sptd->spt_ppa;
1057                 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1058                         if (ppa[an_idx] == NULL) {
1059                                 seg_pinactive(seg, NULL, seg->s_base,
1060                                     sptd->spt_amp->size, ppa,
1061                                     S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1062                                 *ppp = NULL;
1063                                 return (ENOTSUP);
1064                         }
1065                         if ((szc = ppa[an_idx]->p_szc) != 0) {
1066                                 npgs = page_get_pagecnt(szc);
1067                                 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1068                         } else {
1069                                 an_idx++;
1070                         }
1071                 }
1072                 /*
1073                  * Since we cache the entire DISM segment, we want to
1074                  * set ppp to point to the first slot that corresponds
1075                  * to the requested addr, i.e. pg_idx.
1076                  */
1077                 *ppp = &(sptd->spt_ppa[pg_idx]);
1078                 return (0);
1079         }
1080
1081         mutex_enter(&sptd->spt_lock);
1082         /*
1083          * try to find pages in segment page cache with mutex
1084          */
1085         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1086             S_WRITE, SEGP_FORCE_WIRED);
1087         if (pplist != NULL) {
1088                 ASSERT(sptd->spt_ppa != NULL);
1089                 ASSERT(sptd->spt_ppa == pplist);
1090                 ppa = sptd->spt_ppa;
1091                 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1092                         if (ppa[an_idx] == NULL) {
1093                                 mutex_exit(&sptd->spt_lock);
1094                                 seg_pinactive(seg, NULL, seg->s_base,
1095                                     sptd->spt_amp->size, ppa,
1096                                     S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1097                                 *ppp = NULL;
1098                                 return (ENOTSUP);
1099                         }
1100                         if ((szc = ppa[an_idx]->p_szc) != 0) {
1101                                 npgs = page_get_pagecnt(szc);
1102                                 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1103                         } else {
1104                                 an_idx++;
1105                         }
1106                 }
1107                 /*
1108                  * Since we cache the entire DISM segment, we want to
1109                  * set ppp to point to the first slot that corresponds
1110                  * to the requested addr, i.e. pg_idx.
1111                  */
1112                 mutex_exit(&sptd->spt_lock);
1113                 *ppp = &(sptd->spt_ppa[pg_idx]);
1114                 return (0);
1115         }
1116         if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1117             SEGP_FORCE_WIRED) == SEGP_FAIL) {
1118                 mutex_exit(&sptd->spt_lock);
1119                 *ppp = NULL;
1120                 return (ENOTSUP);
1121         }
1122
1123         /*
1124          * No need to worry about protections because DISM pages are always rw.
1125          */
1126         pl = pplist = NULL;
1127         amp = sptd->spt_amp;
1128
1129         /*
1130          * Do we need to build the ppa array?
1131          */
1132         if (sptd->spt_ppa == NULL) {
1133                 pgcnt_t lpg_cnt = 0;
1134
1135                 pl_built = 1;
1136                 tot_npages = btopr(sptd->spt_amp->size);
1137
1138                 ASSERT(sptd->spt_pcachecnt == 0);
1139                 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
1140                 pl = pplist;
1141
1142                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1143                 for (an_idx = 0; an_idx < tot_npages; ) {
1144                         ap = anon_get_ptr(amp->ahp, an_idx);
1145                         /*
1146                          * Cache only mlocked pages. For large pages
1147                          * if one (constituent) page is mlocked
1148                          * all pages for that large page
1149                          * are cached also. This is for quick
1150                          * lookups of ppa array;
1151                          */
1152                         if ((ap != NULL) && (lpg_cnt != 0 ||
1153                             (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1154
1155                                 swap_xlate(ap, &vp, &off);
1156                                 pp = page_lookup(vp, off, SE_SHARED);
1157                                 ASSERT(pp != NULL);
1158                                 if (lpg_cnt == 0) {
1159                                         lpg_cnt++;
1160                                         /*
1161                                          * For a small page, we are done --
1162                                          * lpg_count is reset to 0 below.
1163                                          *
1164                                          * For a large page, we are guaranteed
1165                                          * to find the anon structures of all
1166                                          * constituent pages and a non-zero
1167                                          * lpg_cnt ensures that we don't test
1168                                          * for mlock for these. We are done
1169                                          * when lpg_count reaches (npgs + 1).
1170                                          * If we are not the first constituent
1171                                          * page, restart at the first one.
1172                                          */
1173                                         npgs = page_get_pagecnt(pp->p_szc);
1174                                         if (!IS_P2ALIGNED(an_idx, npgs)) {
1175                                                 an_idx = P2ALIGN(an_idx, npgs);
1176                                                 page_unlock(pp);
1177                                                 continue;
1178                                         }
1179                                 }
1180                                 if (++lpg_cnt > npgs)
1181                                         lpg_cnt = 0;
1182
1183                                 /*
1184                                  * availrmem is decremented only
1185                                  * for unlocked pages
1186                                  */
1187                                 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1188                                         claim_availrmem++;
1189                                 pplist[an_idx] = pp;
1190                         }
1191                         an_idx++;
1192                 }
1193                 ANON_LOCK_EXIT(&amp->a_rwlock);
1194
1195                 if (claim_availrmem) {
1196                         mutex_enter(&freemem_lock);
1197                         if (availrmem < tune.t_minarmem + claim_availrmem) {
1198                                 mutex_exit(&freemem_lock);
1199                                 ret = ENOTSUP;
1200                                 claim_availrmem = 0;
1201                                 goto insert_fail;
1202                         } else {
1203                                 availrmem -= claim_availrmem;
1204                         }
1205                         mutex_exit(&freemem_lock);
1206                 }
1207
1208                 sptd->spt_ppa = pl;
1209         } else {
1210                 /*
1211                  * We already have a valid ppa[].
1212                  */
1213                 pl = sptd->spt_ppa;
1214         }
1215
1216         ASSERT(pl != NULL);
1217
1218         ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1219             sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1220             segspt_reclaim);
1221         if (ret == SEGP_FAIL) {
1222                 /*
1223                  * seg_pinsert failed. We return
1224                  * ENOTSUP, so that the as_pagelock() code will
1225                  * then try the slower F_SOFTLOCK path.
1226                  */
1227                 if (pl_built) {
1228                         /*
1229                          * No one else has referenced the ppa[].
1230                          * We created it and we need to destroy it.
1231                          */
1232                         sptd->spt_ppa = NULL;
1233                 }
1234                 ret = ENOTSUP;
1235                 goto insert_fail;
1236         }
1237
1238         /*
1239          * In either case, we increment softlockcnt on the 'real' segment.
1240          */
1241         sptd->spt_pcachecnt++;
1242         atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1243
1244         ppa = sptd->spt_ppa;
1245         for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1246                 if (ppa[an_idx] == NULL) {
1247                         mutex_exit(&sptd->spt_lock);
1248                         seg_pinactive(seg, NULL, seg->s_base,
1249                             sptd->spt_amp->size,
1250                             pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1251                         *ppp = NULL;
1252                         return (ENOTSUP);
1253                 }
1254                 if ((szc = ppa[an_idx]->p_szc) != 0) {
1255                         npgs = page_get_pagecnt(szc);
1256                         an_idx = P2ROUNDUP(an_idx + 1, npgs);
1257                 } else {
1258                         an_idx++;
1259                 }
1260         }
1261         /*
1262          * We can now drop the sptd->spt_lock since the ppa[]
1263          * exists and we have incremented pacachecnt.
1264          */
1265         mutex_exit(&sptd->spt_lock);
1266
1267         /*
1268          * Since we cache the entire segment, we want to
1269          * set ppp to point to the first slot that corresponds
1270          * to the requested addr, i.e. pg_idx.
1271          */
1272         *ppp = &(sptd->spt_ppa[pg_idx]);
1273         return (0);
1274
1275 insert_fail:
1276         /*
1277          * We will only reach this code if we tried and failed.
1278          *
1279          * And we can drop the lock on the dummy seg, once we've failed
1280          * to set up a new ppa[].
1281          */
1282         mutex_exit(&sptd->spt_lock);
1283
1284         if (pl_built) {
1285                 if (claim_availrmem) {
1286                         mutex_enter(&freemem_lock);
1287                         availrmem += claim_availrmem;
1288                         mutex_exit(&freemem_lock);
1289                 }
1290
1291                 /*
1292                  * We created pl and we need to destroy it.
1293                  */
1294                 pplist = pl;
1295                 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1296                         if (pplist[an_idx] != NULL)
1297                                 page_unlock(pplist[an_idx]);
1298                 }
1299                 kmem_free(pl, sizeof (page_t *) * tot_npages);
1300         }
1301
1302         if (shmd->shm_softlockcnt <= 0) {
1303                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1304                         mutex_enter(&seg->s_as->a_contents);
1305                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1306                                 AS_CLRUNMAPWAIT(seg->s_as);
1307                                 cv_broadcast(&seg->s_as->a_cv);
1308                         }
1309                         mutex_exit(&seg->s_as->a_contents);
1310                 }
1311         }
1312         *ppp = NULL;
1313         return (ret);
1314 }
1315
1316
1317
1318 /*
1319  * return locked pages over a given range.
1320  *
1321  * We will cache the entire ISM segment and save the pplist for the
1322  * entire segment in the ppa field of the underlying ISM segment structure.
1323  * Later, during a call to segspt_reclaim() we will use this ppa array
1324  * to page_unlock() all of the pages and then we will free this ppa list.
1325  */
1326 /*ARGSUSED*/
1327 static int
1328 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1329     struct page ***ppp, enum lock_type type, enum seg_rw rw)
1330 {
1331         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1332         struct seg      *sptseg = shmd->shm_sptseg;
1333         struct spt_data *sptd = sptseg->s_data;
1334         pgcnt_t np, page_index, npages;
1335         caddr_t a, spt_base;
1336         struct page **pplist, **pl, *pp;
1337         struct anon_map *amp;
1338         ulong_t anon_index;
1339         int ret = ENOTSUP;
1340         uint_t  pl_built = 0;
1341         struct anon *ap;
1342         struct vnode *vp;
1343         u_offset_t off;
1344
1345         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1346         ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1347
1348
1349         /*
1350          * We want to lock/unlock the entire ISM segment. Therefore,
1351          * we will be using the underlying sptseg and it's base address
1352          * and length for the caching arguments.
1353          */
1354         ASSERT(sptseg);
1355         ASSERT(sptd);
1356
1357         if (sptd->spt_flags & SHM_PAGEABLE) {
1358                 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1359         }
1360
1361         page_index = seg_page(seg, addr);
1362         npages = btopr(len);
1363
1364         /*
1365          * check if the request is larger than number of pages covered
1366          * by amp
1367          */
1368         if (page_index + npages > btopr(sptd->spt_amp->size)) {
1369                 *ppp = NULL;
1370                 return (ENOTSUP);
1371         }
1372
1373         if (type == L_PAGEUNLOCK) {
1374
1375                 ASSERT(sptd->spt_ppa != NULL);
1376
1377                 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1378                     sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1379
1380                 /*
1381                  * If someone is blocked while unmapping, we purge
1382                  * segment page cache and thus reclaim pplist synchronously
1383                  * without waiting for seg_pasync_thread. This speeds up
1384                  * unmapping in cases where munmap(2) is called, while
1385                  * raw async i/o is still in progress or where a thread
1386                  * exits on data fault in a multithreaded application.
1387                  */
1388                 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1389                         segspt_purge(seg);
1390                 }
1391                 return (0);
1392         }
1393
1394         /* The L_PAGELOCK case... */
1395
1396         /*
1397          * First try to find pages in segment page cache, without
1398          * holding the segment lock.
1399          */
1400         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1401             S_WRITE, SEGP_FORCE_WIRED);
1402         if (pplist != NULL) {
1403                 ASSERT(sptd->spt_ppa == pplist);
1404                 ASSERT(sptd->spt_ppa[page_index]);
1405                 /*
1406                  * Since we cache the entire ISM segment, we want to
1407                  * set ppp to point to the first slot that corresponds
1408                  * to the requested addr, i.e. page_index.
1409                  */
1410                 *ppp = &(sptd->spt_ppa[page_index]);
1411                 return (0);
1412         }
1413
1414         mutex_enter(&sptd->spt_lock);
1415
1416         /*
1417          * try to find pages in segment page cache
1418          */
1419         pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1420             S_WRITE, SEGP_FORCE_WIRED);
1421         if (pplist != NULL) {
1422                 ASSERT(sptd->spt_ppa == pplist);
1423                 /*
1424                  * Since we cache the entire segment, we want to
1425                  * set ppp to point to the first slot that corresponds
1426                  * to the requested addr, i.e. page_index.
1427                  */
1428                 mutex_exit(&sptd->spt_lock);
1429                 *ppp = &(sptd->spt_ppa[page_index]);
1430                 return (0);
1431         }
1432
1433         if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1434             SEGP_FORCE_WIRED) == SEGP_FAIL) {
1435                 mutex_exit(&sptd->spt_lock);
1436                 *ppp = NULL;
1437                 return (ENOTSUP);
1438         }
1439
1440         /*
1441          * No need to worry about protections because ISM pages
1442          * are always rw.
1443          */
1444         pl = pplist = NULL;
1445
1446         /*
1447          * Do we need to build the ppa array?
1448          */
1449         if (sptd->spt_ppa == NULL) {
1450                 ASSERT(sptd->spt_ppa == pplist);
1451
1452                 spt_base = sptseg->s_base;
1453                 pl_built = 1;
1454
1455                 /*
1456                  * availrmem is decremented once during anon_swap_adjust()
1457                  * and is incremented during the anon_unresv(), which is
1458                  * called from shm_rm_amp() when the segment is destroyed.
1459                  */
1460                 amp = sptd->spt_amp;
1461                 ASSERT(amp != NULL);
1462
1463                 /* pcachecnt is protected by sptd->spt_lock */
1464                 ASSERT(sptd->spt_pcachecnt == 0);
1465                 pplist = kmem_zalloc(sizeof (page_t *)
1466                     * btopr(sptd->spt_amp->size), KM_SLEEP);
1467                 pl = pplist;
1468
1469                 anon_index = seg_page(sptseg, spt_base);
1470
1471                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1472                 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1473                     a += PAGESIZE, anon_index++, pplist++) {
1474                         ap = anon_get_ptr(amp->ahp, anon_index);
1475                         ASSERT(ap != NULL);
1476                         swap_xlate(ap, &vp, &off);
1477                         pp = page_lookup(vp, off, SE_SHARED);
1478                         ASSERT(pp != NULL);
1479                         *pplist = pp;
1480                 }
1481                 ANON_LOCK_EXIT(&amp->a_rwlock);
1482
1483                 if (a < (spt_base + sptd->spt_amp->size)) {
1484                         ret = ENOTSUP;
1485                         goto insert_fail;
1486                 }
1487                 sptd->spt_ppa = pl;
1488         } else {
1489                 /*
1490                  * We already have a valid ppa[].
1491                  */
1492                 pl = sptd->spt_ppa;
1493         }
1494
1495         ASSERT(pl != NULL);
1496
1497         ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1498             sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1499             segspt_reclaim);
1500         if (ret == SEGP_FAIL) {
1501                 /*
1502                  * seg_pinsert failed. We return
1503                  * ENOTSUP, so that the as_pagelock() code will
1504                  * then try the slower F_SOFTLOCK path.
1505                  */
1506                 if (pl_built) {
1507                         /*
1508                          * No one else has referenced the ppa[].
1509                          * We created it and we need to destroy it.
1510                          */
1511                         sptd->spt_ppa = NULL;
1512                 }
1513                 ret = ENOTSUP;
1514                 goto insert_fail;
1515         }
1516
1517         /*
1518          * In either case, we increment softlockcnt on the 'real' segment.
1519          */
1520         sptd->spt_pcachecnt++;
1521         atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1522
1523         /*
1524          * We can now drop the sptd->spt_lock since the ppa[]
1525          * exists and we have incremented pacachecnt.
1526          */
1527         mutex_exit(&sptd->spt_lock);
1528
1529         /*
1530          * Since we cache the entire segment, we want to
1531          * set ppp to point to the first slot that corresponds
1532          * to the requested addr, i.e. page_index.
1533          */
1534         *ppp = &(sptd->spt_ppa[page_index]);
1535         return (0);
1536
1537 insert_fail:
1538         /*
1539          * We will only reach this code if we tried and failed.
1540          *
1541          * And we can drop the lock on the dummy seg, once we've failed
1542          * to set up a new ppa[].
1543          */
1544         mutex_exit(&sptd->spt_lock);
1545
1546         if (pl_built) {
1547                 /*
1548                  * We created pl and we need to destroy it.
1549                  */
1550                 pplist = pl;
1551                 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1552                 while (np) {
1553                         page_unlock(*pplist);
1554                         np--;
1555                         pplist++;
1556                 }
1557                 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1558         }
1559         if (shmd->shm_softlockcnt <= 0) {
1560                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1561                         mutex_enter(&seg->s_as->a_contents);
1562                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1563                                 AS_CLRUNMAPWAIT(seg->s_as);
1564                                 cv_broadcast(&seg->s_as->a_cv);
1565                         }
1566                         mutex_exit(&seg->s_as->a_contents);
1567                 }
1568         }
1569         *ppp = NULL;
1570         return (ret);
1571 }
1572
1573 /*
1574  * purge any cached pages in the I/O page cache
1575  */
1576 static void
1577 segspt_purge(struct seg *seg)
1578 {
1579         seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1580 }
1581
1582 static int
1583 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1584     enum seg_rw rw, int async)
1585 {
1586         struct seg *seg = (struct seg *)ptag;
1587         struct  shm_data *shmd = (struct shm_data *)seg->s_data;
1588         struct  seg     *sptseg;
1589         struct  spt_data *sptd;
1590         pgcnt_t npages, i, free_availrmem = 0;
1591         int     done = 0;
1592
1593 #ifdef lint
1594         addr = addr;
1595 #endif
1596         sptseg = shmd->shm_sptseg;
1597         sptd = sptseg->s_data;
1598         npages = (len >> PAGESHIFT);
1599         ASSERT(npages);
1600         ASSERT(sptd->spt_pcachecnt != 0);
1601         ASSERT(sptd->spt_ppa == pplist);
1602         ASSERT(npages == btopr(sptd->spt_amp->size));
1603         ASSERT(async || AS_LOCK_HELD(seg->s_as));
1604
1605         /*
1606          * Acquire the lock on the dummy seg and destroy the
1607          * ppa array IF this is the last pcachecnt.
1608          */
1609         mutex_enter(&sptd->spt_lock);
1610         if (--sptd->spt_pcachecnt == 0) {
1611                 for (i = 0; i < npages; i++) {
1612                         if (pplist[i] == NULL) {
1613                                 continue;
1614                         }
1615                         if (rw == S_WRITE) {
1616                                 hat_setrefmod(pplist[i]);
1617                         } else {
1618                                 hat_setref(pplist[i]);
1619                         }
1620                         if ((sptd->spt_flags & SHM_PAGEABLE) &&
1621                             (sptd->spt_ppa_lckcnt[i] == 0))
1622                                 free_availrmem++;
1623                         page_unlock(pplist[i]);
1624                 }
1625                 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1626                         mutex_enter(&freemem_lock);
1627                         availrmem += free_availrmem;
1628                         mutex_exit(&freemem_lock);
1629                 }
1630                 /*
1631                  * Since we want to cach/uncache the entire ISM segment,
1632                  * we will track the pplist in a segspt specific field
1633                  * ppa, that is initialized at the time we add an entry to
1634                  * the cache.
1635                  */
1636                 ASSERT(sptd->spt_pcachecnt == 0);
1637                 kmem_free(pplist, sizeof (page_t *) * npages);
1638                 sptd->spt_ppa = NULL;
1639                 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1640                 sptd->spt_gen++;
1641                 cv_broadcast(&sptd->spt_cv);
1642                 done = 1;
1643         }
1644         mutex_exit(&sptd->spt_lock);
1645
1646         /*
1647          * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1648          * may not hold AS lock (in this case async argument is not 0). This
1649          * means if softlockcnt drops to 0 after the decrement below address
1650          * space may get freed. We can't allow it since after softlock
1651          * derement to 0 we still need to access as structure for possible
1652          * wakeup of unmap waiters. To prevent the disappearance of as we take
1653          * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1654          * this mutex as a barrier to make sure this routine completes before
1655          * segment is freed.
1656          *
1657          * The second complication we have to deal with in async case is a
1658          * possibility of missed wake up of unmap wait thread. When we don't
1659          * hold as lock here we may take a_contents lock before unmap wait
1660          * thread that was first to see softlockcnt was still not 0. As a
1661          * result we'll fail to wake up an unmap wait thread. To avoid this
1662          * race we set nounmapwait flag in as structure if we drop softlockcnt
1663          * to 0 if async is not 0.  unmapwait thread
1664          * will not block if this flag is set.
1665          */
1666         if (async)
1667                 mutex_enter(&shmd->shm_segfree_syncmtx);
1668
1669         /*
1670          * Now decrement softlockcnt.
1671          */
1672         ASSERT(shmd->shm_softlockcnt > 0);
1673         atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1674
1675         if (shmd->shm_softlockcnt <= 0) {
1676                 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1677                         mutex_enter(&seg->s_as->a_contents);
1678                         if (async)
1679                                 AS_SETNOUNMAPWAIT(seg->s_as);
1680                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1681                                 AS_CLRUNMAPWAIT(seg->s_as);
1682                                 cv_broadcast(&seg->s_as->a_cv);
1683                         }
1684                         mutex_exit(&seg->s_as->a_contents);
1685                 }
1686         }
1687
1688         if (async)
1689                 mutex_exit(&shmd->shm_segfree_syncmtx);
1690
1691         return (done);
1692 }
1693
1694 /*
1695  * Do a F_SOFTUNLOCK call over the range requested.
1696  * The range must have already been F_SOFTLOCK'ed.
1697  *
1698  * The calls to acquire and release the anon map lock mutex were
1699  * removed in order to avoid a deadly embrace during a DR
1700  * memory delete operation.  (Eg. DR blocks while waiting for a
1701  * exclusive lock on a page that is being used for kaio; the
1702  * thread that will complete the kaio and call segspt_softunlock
1703  * blocks on the anon map lock; another thread holding the anon
1704  * map lock blocks on another page lock via the segspt_shmfault
1705  * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1706  *
1707  * The appropriateness of the removal is based upon the following:
1708  * 1. If we are holding a segment's reader lock and the page is held
1709  * shared, then the corresponding element in anonmap which points to
1710  * anon struct cannot change and there is no need to acquire the
1711  * anonymous map lock.
1712  * 2. Threads in segspt_softunlock have a reader lock on the segment
1713  * and already have the shared page lock, so we are guaranteed that
1714  * the anon map slot cannot change and therefore can call anon_get_ptr()
1715  * without grabbing the anonymous map lock.
1716  * 3. Threads that softlock a shared page break copy-on-write, even if
1717  * its a read.  Thus cow faults can be ignored with respect to soft
1718  * unlocking, since the breaking of cow means that the anon slot(s) will
1719  * not be shared.
1720  */
1721 static void
1722 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1723     size_t len, enum seg_rw rw)
1724 {
1725         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1726         struct seg      *sptseg;
1727         struct spt_data *sptd;
1728         page_t *pp;
1729         caddr_t adr;
1730         struct vnode *vp;
1731         u_offset_t offset;
1732         ulong_t anon_index;
1733         struct anon_map *amp;           /* XXX - for locknest */
1734         struct anon *ap = NULL;
1735         pgcnt_t npages;
1736
1737         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1738
1739         sptseg = shmd->shm_sptseg;
1740         sptd = sptseg->s_data;
1741
1742         /*
1743          * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1744          * and therefore their pages are SE_SHARED locked
1745          * for the entire life of the segment.
1746          */
1747         if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1748             ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1749                 goto softlock_decrement;
1750         }
1751
1752         /*
1753          * Any thread is free to do a page_find and
1754          * page_unlock() on the pages within this seg.
1755          *
1756          * We are already holding the as->a_lock on the user's
1757          * real segment, but we need to hold the a_lock on the
1758          * underlying dummy as. This is mostly to satisfy the
1759          * underlying HAT layer.
1760          */
1761         AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1762         hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1763         AS_LOCK_EXIT(sptseg->s_as);
1764
1765         amp = sptd->spt_amp;
1766         ASSERT(amp != NULL);
1767         anon_index = seg_page(sptseg, sptseg_addr);
1768
1769         for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1770                 ap = anon_get_ptr(amp->ahp, anon_index++);
1771                 ASSERT(ap != NULL);
1772                 swap_xlate(ap, &vp, &offset);
1773
1774                 /*
1775                  * Use page_find() instead of page_lookup() to
1776                  * find the page since we know that it has a
1777                  * "shared" lock.
1778                  */
1779                 pp = page_find(vp, offset);
1780                 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1781                 if (pp == NULL) {
1782                         panic("segspt_softunlock: "
1783                             "addr %p, ap %p, vp %p, off %llx",
1784                             (void *)adr, (void *)ap, (void *)vp, offset);
1785                         /*NOTREACHED*/
1786                 }
1787
1788                 if (rw == S_WRITE) {
1789                         hat_setrefmod(pp);
1790                 } else if (rw != S_OTHER) {
1791                         hat_setref(pp);
1792                 }
1793                 page_unlock(pp);
1794         }
1795
1796 softlock_decrement:
1797         npages = btopr(len);
1798         ASSERT(shmd->shm_softlockcnt >= npages);
1799         atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1800         if (shmd->shm_softlockcnt == 0) {
1801                 /*
1802                  * All SOFTLOCKS are gone. Wakeup any waiting
1803                  * unmappers so they can try again to unmap.
1804                  * Check for waiters first without the mutex
1805                  * held so we don't always grab the mutex on
1806                  * softunlocks.
1807                  */
1808                 if (AS_ISUNMAPWAIT(seg->s_as)) {
1809                         mutex_enter(&seg->s_as->a_contents);
1810                         if (AS_ISUNMAPWAIT(seg->s_as)) {
1811                                 AS_CLRUNMAPWAIT(seg->s_as);
1812                                 cv_broadcast(&seg->s_as->a_cv);
1813                         }
1814                         mutex_exit(&seg->s_as->a_contents);
1815                 }
1816         }
1817 }
1818
1819 int
1820 segspt_shmattach(struct seg **segpp, void *argsp)
1821 {
1822         struct seg *seg = *segpp;
1823         struct shm_data *shmd_arg = (struct shm_data *)argsp;
1824         struct shm_data *shmd;
1825         struct anon_map *shm_amp = shmd_arg->shm_amp;
1826         struct spt_data *sptd;
1827         int error = 0;
1828
1829         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1830
1831         shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1832         if (shmd == NULL)
1833                 return (ENOMEM);
1834
1835         shmd->shm_sptas = shmd_arg->shm_sptas;
1836         shmd->shm_amp = shm_amp;
1837         shmd->shm_sptseg = shmd_arg->shm_sptseg;
1838
1839         (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1840             NULL, 0, seg->s_size);
1841
1842         mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1843
1844         seg->s_data = (void *)shmd;
1845         seg->s_ops = &segspt_shmops;
1846         seg->s_szc = shmd->shm_sptseg->s_szc;
1847         sptd = shmd->shm_sptseg->s_data;
1848
1849         if (sptd->spt_flags & SHM_PAGEABLE) {
1850                 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1851                     KM_NOSLEEP)) == NULL) {
1852                         seg->s_data = (void *)NULL;
1853                         kmem_free(shmd, (sizeof (*shmd)));
1854                         return (ENOMEM);
1855                 }
1856                 shmd->shm_lckpgs = 0;
1857                 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1858                         if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1859                             shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1860                             seg->s_size, seg->s_szc)) != 0) {
1861                                 kmem_free(shmd->shm_vpage,
1862                                     btopr(shm_amp->size));
1863                         }
1864                 }
1865         } else {
1866                 error = hat_share(seg->s_as->a_hat, seg->s_base,
1867                     shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1868                     seg->s_size, seg->s_szc);
1869         }
1870         if (error) {
1871                 seg->s_szc = 0;
1872                 seg->s_data = (void *)NULL;
1873                 kmem_free(shmd, (sizeof (*shmd)));
1874         } else {
1875                 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1876                 shm_amp->refcnt++;
1877                 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1878         }
1879         return (error);
1880 }
1881
1882 int
1883 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1884 {
1885         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1886         int reclaim = 1;
1887
1888         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1889 retry:
1890         if (shmd->shm_softlockcnt > 0) {
1891                 if (reclaim == 1) {
1892                         segspt_purge(seg);
1893                         reclaim = 0;
1894                         goto retry;
1895                 }
1896                 return (EAGAIN);
1897         }
1898
1899         if (ssize != seg->s_size) {
1900 #ifdef DEBUG
1901                 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1902                     ssize, seg->s_size);
1903 #endif
1904                 return (EINVAL);
1905         }
1906
1907         (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1908             NULL, 0);
1909         hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1910
1911         seg_free(seg);
1912
1913         return (0);
1914 }
1915
1916 void
1917 segspt_shmfree(struct seg *seg)
1918 {
1919         struct shm_data *shmd = (struct shm_data *)seg->s_data;
1920         struct anon_map *shm_amp = shmd->shm_amp;
1921
1922         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1923
1924         (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1925             MC_UNLOCK, NULL, 0);
1926
1927         /*
1928          * Need to increment refcnt when attaching
1929          * and decrement when detaching because of dup().
1930          */
1931         ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1932         shm_amp->refcnt--;
1933         ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1934
1935         if (shmd->shm_vpage) {  /* only for DISM */
1936                 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1937                 shmd->shm_vpage = NULL;
1938         }
1939
1940         /*
1941          * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1942          * still working with this segment without holding as lock.
1943          */
1944         ASSERT(shmd->shm_softlockcnt == 0);
1945         mutex_enter(&shmd->shm_segfree_syncmtx);
1946         mutex_destroy(&shmd->shm_segfree_syncmtx);
1947
1948         kmem_free(shmd, sizeof (*shmd));
1949 }
1950
1951 /*ARGSUSED*/
1952 int
1953 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1954 {
1955         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1956
1957         /*
1958          * Shared page table is more than shared mapping.
1959          *  Individual process sharing page tables can't change prot
1960          *  because there is only one set of page tables.
1961          *  This will be allowed after private page table is
1962          *  supported.
1963          */
1964 /* need to return correct status error? */
1965         return (0);
1966 }
1967
1968
1969 faultcode_t
1970 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1971     size_t len, enum fault_type type, enum seg_rw rw)
1972 {
1973         struct  shm_data        *shmd = (struct shm_data *)seg->s_data;
1974         struct  seg             *sptseg = shmd->shm_sptseg;
1975         struct  as              *curspt = shmd->shm_sptas;
1976         struct  spt_data        *sptd = sptseg->s_data;
1977         pgcnt_t npages;
1978         size_t  size;
1979         caddr_t segspt_addr, shm_addr;
1980         page_t  **ppa;
1981         int     i;
1982         ulong_t an_idx = 0;
1983         int     err = 0;
1984         int     dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1985         size_t  pgsz;
1986         pgcnt_t pgcnt;
1987         caddr_t a;
1988         pgcnt_t pidx;
1989
1990 #ifdef lint
1991         hat = hat;
1992 #endif
1993         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1994
1995         /*
1996          * Because of the way spt is implemented
1997          * the realsize of the segment does not have to be
1998          * equal to the segment size itself. The segment size is
1999          * often in multiples of a page size larger than PAGESIZE.
2000          * The realsize is rounded up to the nearest PAGESIZE
2001          * based on what the user requested. This is a bit of
2002          * ungliness that is historical but not easily fixed
2003          * without re-designing the higher levels of ISM.
2004          */
2005         ASSERT(addr >= seg->s_base);
2006         if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2007                 return (FC_NOMAP);
2008         /*
2009          * For all of the following cases except F_PROT, we need to
2010          * make any necessary adjustments to addr and len
2011          * and get all of the necessary page_t's into an array called ppa[].
2012          *
2013          * The code in shmat() forces base addr and len of ISM segment
2014          * to be aligned to largest page size supported. Therefore,
2015          * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2016          * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2017          * in large pagesize chunks, or else we will screw up the HAT
2018          * layer by calling hat_memload_array() with differing page sizes
2019          * over a given virtual range.
2020          */
2021         pgsz = page_get_pagesize(sptseg->s_szc);
2022         pgcnt = page_get_pagecnt(sptseg->s_szc);
2023         shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2024         size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2025         npages = btopr(size);
2026
2027         /*
2028          * Now we need to convert from addr in segshm to addr in segspt.
2029          */
2030         an_idx = seg_page(seg, shm_addr);
2031         segspt_addr = sptseg->s_base + ptob(an_idx);
2032
2033         ASSERT((segspt_addr + ptob(npages)) <=
2034             (sptseg->s_base + sptd->spt_realsize));
2035         ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
2036
2037         switch (type) {
2038
2039         case F_SOFTLOCK:
2040
2041                 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2042                 /*
2043                  * Fall through to the F_INVAL case to load up the hat layer
2044                  * entries with the HAT_LOAD_LOCK flag.
2045                  */
2046                 /* FALLTHRU */
2047         case F_INVAL:
2048
2049                 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2050                         return (FC_NOMAP);
2051
2052                 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
2053
2054                 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
2055                 if (err != 0) {
2056                         if (type == F_SOFTLOCK) {
2057                                 atomic_add_long((ulong_t *)(
2058                                     &(shmd->shm_softlockcnt)), -npages);
2059                         }
2060                         goto dism_err;
2061                 }
2062                 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2063                 a = segspt_addr;
2064                 pidx = 0;
2065                 if (type == F_SOFTLOCK) {
2066
2067                         /*
2068                          * Load up the translation keeping it
2069                          * locked and don't unlock the page.
2070                          */
2071                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2072                                 hat_memload_array(sptseg->s_as->a_hat,
2073                                     a, pgsz, &ppa[pidx], sptd->spt_prot,
2074                                     HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2075                         }
2076                 } else {
2077                         /*
2078                          * Migrate pages marked for migration
2079                          */
2080                         if (lgrp_optimizations())
2081                                 page_migrate(seg, shm_addr, ppa, npages);
2082
2083                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2084                                 hat_memload_array(sptseg->s_as->a_hat,
2085                                     a, pgsz, &ppa[pidx],
2086                                     sptd->spt_prot,
2087                                     HAT_LOAD_SHARE);
2088                         }
2089
2090                         /*
2091                          * And now drop the SE_SHARED lock(s).
2092                          */
2093                         if (dyn_ism_unmap) {
2094                                 for (i = 0; i < npages; i++) {
2095                                         page_unlock(ppa[i]);
2096                                 }
2097                         }
2098                 }
2099
2100                 if (!dyn_ism_unmap) {
2101                         if (hat_share(seg->s_as->a_hat, shm_addr,
2102                             curspt->a_hat, segspt_addr, ptob(npages),
2103                             seg->s_szc) != 0) {
2104                                 panic("hat_share err in DISM fault");
2105                                 /* NOTREACHED */
2106                         }
2107                         if (type == F_INVAL) {
2108                                 for (i = 0; i < npages; i++) {
2109                                         page_unlock(ppa[i]);
2110                                 }
2111                         }
2112                 }
2113                 AS_LOCK_EXIT(sptseg->s_as);
2114 dism_err:
2115                 kmem_free(ppa, npages * sizeof (page_t *));
2116                 return (err);
2117
2118         case F_SOFTUNLOCK:
2119
2120                 /*
2121                  * This is a bit ugly, we pass in the real seg pointer,
2122                  * but the segspt_addr is the virtual address within the
2123                  * dummy seg.
2124                  */
2125                 segspt_softunlock(seg, segspt_addr, size, rw);
2126                 return (0);
2127
2128         case F_PROT:
2129
2130                 /*
2131                  * This takes care of the unusual case where a user
2132                  * allocates a stack in shared memory and a register
2133                  * window overflow is written to that stack page before
2134                  * it is otherwise modified.
2135                  *
2136                  * We can get away with this because ISM segments are
2137                  * always rw. Other than this unusual case, there
2138                  * should be no instances of protection violations.
2139                  */
2140                 return (0);
2141
2142         default:
2143 #ifdef DEBUG
2144                 panic("segspt_dismfault default type?");
2145 #else
2146                 return (FC_NOMAP);
2147 #endif
2148         }
2149 }
2150
2151
2152 faultcode_t
2153 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2154     size_t len, enum fault_type type, enum seg_rw rw)
2155 {
2156         struct shm_data         *shmd = (struct shm_data *)seg->s_data;
2157         struct seg              *sptseg = shmd->shm_sptseg;
2158         struct as               *curspt = shmd->shm_sptas;
2159         struct spt_data         *sptd = sptseg->s_data;
2160         pgcnt_t npages;
2161         size_t size;
2162         caddr_t sptseg_addr, shm_addr;
2163         page_t *pp, **ppa;
2164         int     i;
2165         u_offset_t offset;
2166         ulong_t anon_index = 0;
2167         struct vnode *vp;
2168         struct anon_map *amp;           /* XXX - for locknest */
2169         struct anon *ap = NULL;
2170         size_t          pgsz;
2171         pgcnt_t         pgcnt;
2172         caddr_t         a;
2173         pgcnt_t         pidx;
2174         size_t          sz;
2175
2176 #ifdef lint
2177         hat = hat;
2178 #endif
2179
2180         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2181
2182         if (sptd->spt_flags & SHM_PAGEABLE) {
2183                 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2184         }
2185
2186         /*
2187          * Because of the way spt is implemented
2188          * the realsize of the segment does not have to be
2189          * equal to the segment size itself. The segment size is
2190          * often in multiples of a page size larger than PAGESIZE.
2191          * The realsize is rounded up to the nearest PAGESIZE
2192          * based on what the user requested. This is a bit of
2193          * ungliness that is historical but not easily fixed
2194          * without re-designing the higher levels of ISM.
2195          */
2196         ASSERT(addr >= seg->s_base);
2197         if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2198                 return (FC_NOMAP);
2199         /*
2200          * For all of the following cases except F_PROT, we need to
2201          * make any necessary adjustments to addr and len
2202          * and get all of the necessary page_t's into an array called ppa[].
2203          *
2204          * The code in shmat() forces base addr and len of ISM segment
2205          * to be aligned to largest page size supported. Therefore,
2206          * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2207          * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2208          * in large pagesize chunks, or else we will screw up the HAT
2209          * layer by calling hat_memload_array() with differing page sizes
2210          * over a given virtual range.
2211          */
2212         pgsz = page_get_pagesize(sptseg->s_szc);
2213         pgcnt = page_get_pagecnt(sptseg->s_szc);
2214         shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2215         size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2216         npages = btopr(size);
2217
2218         /*
2219          * Now we need to convert from addr in segshm to addr in segspt.
2220          */
2221         anon_index = seg_page(seg, shm_addr);
2222         sptseg_addr = sptseg->s_base + ptob(anon_index);
2223
2224         /*
2225          * And now we may have to adjust npages downward if we have
2226          * exceeded the realsize of the segment or initial anon
2227          * allocations.
2228          */
2229         if ((sptseg_addr + ptob(npages)) >
2230             (sptseg->s_base + sptd->spt_realsize))
2231                 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2232
2233         npages = btopr(size);
2234
2235         ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2236         ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2237
2238         switch (type) {
2239
2240         case F_SOFTLOCK:
2241
2242                 /*
2243                  * availrmem is decremented once during anon_swap_adjust()
2244                  * and is incremented during the anon_unresv(), which is
2245                  * called from shm_rm_amp() when the segment is destroyed.
2246                  */
2247                 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2248                 /*
2249                  * Some platforms assume that ISM pages are SE_SHARED
2250                  * locked for the entire life of the segment.
2251                  */
2252                 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2253                         return (0);
2254                 /*
2255                  * Fall through to the F_INVAL case to load up the hat layer
2256                  * entries with the HAT_LOAD_LOCK flag.
2257                  */
2258
2259                 /* FALLTHRU */
2260         case F_INVAL:
2261
2262                 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2263                         return (FC_NOMAP);
2264
2265                 /*
2266                  * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2267                  * may still rely on this call to hat_share(). That
2268                  * would imply that those hat's can fault on a
2269                  * HAT_LOAD_LOCK translation, which would seem
2270                  * contradictory.
2271                  */
2272                 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2273                         if (hat_share(seg->s_as->a_hat, seg->s_base,
2274                             curspt->a_hat, sptseg->s_base,
2275                             sptseg->s_size, sptseg->s_szc) != 0) {
2276                                 panic("hat_share error in ISM fault");
2277                                 /*NOTREACHED*/
2278                         }
2279                         return (0);
2280                 }
2281                 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2282
2283                 /*
2284                  * I see no need to lock the real seg,
2285                  * here, because all of our work will be on the underlying
2286                  * dummy seg.
2287                  *
2288                  * sptseg_addr and npages now account for large pages.
2289                  */
2290                 amp = sptd->spt_amp;
2291                 ASSERT(amp != NULL);
2292                 anon_index = seg_page(sptseg, sptseg_addr);
2293
2294                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2295                 for (i = 0; i < npages; i++) {
2296                         ap = anon_get_ptr(amp->ahp, anon_index++);
2297                         ASSERT(ap != NULL);
2298                         swap_xlate(ap, &vp, &offset);
2299                         pp = page_lookup(vp, offset, SE_SHARED);
2300                         ASSERT(pp != NULL);
2301                         ppa[i] = pp;
2302                 }
2303                 ANON_LOCK_EXIT(&amp->a_rwlock);
2304                 ASSERT(i == npages);
2305
2306                 /*
2307                  * We are already holding the as->a_lock on the user's
2308                  * real segment, but we need to hold the a_lock on the
2309                  * underlying dummy as. This is mostly to satisfy the
2310                  * underlying HAT layer.
2311                  */
2312                 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2313                 a = sptseg_addr;
2314                 pidx = 0;
2315                 if (type == F_SOFTLOCK) {
2316                         /*
2317                          * Load up the translation keeping it
2318                          * locked and don't unlock the page.
2319                          */
2320                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2321                                 sz = MIN(pgsz, ptob(npages - pidx));
2322                                 hat_memload_array(sptseg->s_as->a_hat, a,
2323                                     sz, &ppa[pidx], sptd->spt_prot,
2324                                     HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2325                         }
2326                 } else {
2327                         /*
2328                          * Migrate pages marked for migration.
2329                          */
2330                         if (lgrp_optimizations())
2331                                 page_migrate(seg, shm_addr, ppa, npages);
2332
2333                         for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2334                                 sz = MIN(pgsz, ptob(npages - pidx));
2335                                 hat_memload_array(sptseg->s_as->a_hat,
2336                                     a, sz, &ppa[pidx],
2337                                     sptd->spt_prot, HAT_LOAD_SHARE);
2338                         }
2339
2340                         /*
2341                          * And now drop the SE_SHARED lock(s).
2342                          */
2343                         for (i = 0; i < npages; i++)
2344                                 page_unlock(ppa[i]);
2345                 }
2346                 AS_LOCK_EXIT(sptseg->s_as);
2347
2348                 kmem_free(ppa, sizeof (page_t *) * npages);
2349                 return (0);
2350         case F_SOFTUNLOCK:
2351
2352                 /*
2353                  * This is a bit ugly, we pass in the real seg pointer,
2354                  * but the sptseg_addr is the virtual address within the
2355                  * dummy seg.
2356                  */
2357                 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2358                 return (0);
2359
2360         case F_PROT:
2361
2362                 /*
2363                  * This takes care of the unusual case where a user
2364                  * allocates a stack in shared memory and a register
2365                  * window overflow is written to that stack page before
2366                  * it is otherwise modified.
2367                  *
2368                  * We can get away with this because ISM segments are
2369                  * always rw. Other than this unusual case, there
2370                  * should be no instances of protection violations.
2371                  */
2372                 return (0);
2373
2374         default:
2375 #ifdef DEBUG
2376                 cmn_err(CE_WARN, "segspt_shmfault default type?");
2377 #endif
2378                 return (FC_NOMAP);
2379         }
2380 }
2381
/*
 * "faulta" handler for shm segments.  There is nothing to do here: both
 * arguments are ignored and 0 is always returned.
 */
/*ARGSUSED*/
static faultcode_t
segspt_shmfaulta(struct seg *seg, caddr_t addr)
{
        return (0);
}
2388
/*
 * "kluster" handler for shm segments.  All arguments are ignored and 0 is
 * always returned.
 */
/*ARGSUSED*/
static int
segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
{
        return (0);
}
2395
/*
 * "swapout" handler for shm segments.  The segment argument is ignored and
 * 0 is always returned.
 */
/*ARGSUSED*/
static size_t
segspt_shmswapout(struct seg *seg)
{
        return (0);
}
2402
2403 /*
2404  * duplicate the shared page tables
2405  */
2406 int
2407 segspt_shmdup(struct seg *seg, struct seg *newseg)
2408 {
2409         struct shm_data         *shmd = (struct shm_data *)seg->s_data;
2410         struct anon_map         *amp = shmd->shm_amp;
2411         struct shm_data         *shmd_new;
2412         struct seg              *spt_seg = shmd->shm_sptseg;
2413         struct spt_data         *sptd = spt_seg->s_data;
2414         int                     error = 0;
2415
2416         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2417
2418         shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2419         newseg->s_data = (void *)shmd_new;
2420         shmd_new->shm_sptas = shmd->shm_sptas;
2421         shmd_new->shm_amp = amp;
2422         shmd_new->shm_sptseg = shmd->shm_sptseg;
2423         newseg->s_ops = &segspt_shmops;
2424         newseg->s_szc = seg->s_szc;
2425         ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2426
2427         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2428         amp->refcnt++;
2429         ANON_LOCK_EXIT(&amp->a_rwlock);
2430
2431         if (sptd->spt_flags & SHM_PAGEABLE) {
2432                 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2433                 shmd_new->shm_lckpgs = 0;
2434                 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2435                         if ((error = hat_share(newseg->s_as->a_hat,
2436                             newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2437                             seg->s_size, seg->s_szc)) != 0) {
2438                                 kmem_free(shmd_new->shm_vpage,
2439                                     btopr(amp->size));
2440                         }
2441                 }
2442                 return (error);
2443         } else {
2444                 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2445                     shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2446                     seg->s_szc));
2447
2448         }
2449 }
2450
2451 /*ARGSUSED*/
2452 int
2453 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2454 {
2455         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2456         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2457
2458         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2459
2460         /*
2461          * ISM segment is always rw.
2462          */
2463         return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2464 }
2465
/*
 * Return an array of locked large pages, for empty slots allocate
 * private zero-filled anon pages.
 *
 *   sptseg   the underlying spt segment
 *   sptaddr  start address within sptseg; must be aligned to the page
 *            size of sptseg->s_szc (asserted)
 *   len      length in bytes; non-zero and aligned likewise (asserted)
 *   ppa      output array, filled from index 0 by anon_map_getpages()
 *
 * The range is walked one chunk of the current page size at a time.  When
 * anon_map_getpages() returns a negative value the page size code is
 * adjusted (see the comment in the loop) and the walk resumes from the
 * address that failed.
 *
 * The anon map's a_rwlock is held as reader for the whole walk, and the
 * anon array is entered once per group of page_get_pagecnt(amp->a_szc)
 * anon slots.
 *
 * Returns 0 on success.  If anon_map_getpages() returns a positive error,
 * the entries ppa[0 .. ppa_idx-1] gathered so far are page_unlock()ed and
 * FC_MAKE_ERR(error) is returned.
 */
static int
spt_anon_getpages(
        struct seg *sptseg,
        caddr_t sptaddr,
        size_t len,
        page_t *ppa[])
{
        struct  spt_data *sptd = sptseg->s_data;
        struct  anon_map *amp = sptd->spt_amp;
        /* NOTE(review): an enum seg_rw seeded from protection bits - confirm */
        enum    seg_rw rw = sptd->spt_prot;
        uint_t  szc = sptseg->s_szc;
        size_t  pg_sz, share_sz = page_get_pagesize(szc);
        pgcnt_t lp_npgs;
        caddr_t lp_addr, e_sptaddr;
        uint_t  vpprot, ppa_szc = 0;
        struct  vpage *vpage = NULL;
        ulong_t j, ppa_idx;
        int     err, ierr = 0;
        pgcnt_t an_idx;
        anon_sync_obj_t cookie;
        int anon_locked = 0;
        pgcnt_t amp_pgs;


        ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
        ASSERT(len != 0);

        /* Start out with the segment's own page size. */
        pg_sz = share_sz;
        lp_npgs = btop(pg_sz);
        lp_addr = sptaddr;
        e_sptaddr = sptaddr + len;
        an_idx = seg_page(sptseg, sptaddr);
        ppa_idx = 0;

        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

        amp_pgs = page_get_pagecnt(amp->a_szc);

        /*CONSTCOND*/
        while (1) {
                for (; lp_addr < e_sptaddr;
                    an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {

                        /*
                         * If we're currently locked, and we get to a new
                         * page, unlock our current anon chunk.
                         */
                        if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
                                anon_array_exit(&cookie);
                                anon_locked = 0;
                        }
                        if (!anon_locked) {
                                anon_array_enter(amp, an_idx, &cookie);
                                anon_locked = 1;
                        }
                        /*
                         * (uint_t)-1 is the "not reported" marker that the
                         * resize logic below tests for.
                         */
                        ppa_szc = (uint_t)-1;
                        ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
                            lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
                            &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);

                        if (ierr != 0) {
                                /* Positive: hard error.  Negative: resize. */
                                if (ierr > 0) {
                                        err = FC_MAKE_ERR(ierr);
                                        goto lpgs_err;
                                }
                                break;
                        }
                }
                /* The whole range has been covered. */
                if (lp_addr == e_sptaddr) {
                        break;
                }
                ASSERT(lp_addr < e_sptaddr);

                /*
                 * ierr == -1 means we failed to allocate a large page.
                 * so do a size down operation.
                 *
                 * ierr == -2 means some other process that privately shares
                 * pages with this process has allocated a larger page and we
                 * need to retry with larger pages. So do a size up
                 * operation. This relies on the fact that large pages are
                 * never partially shared i.e. if we share any constituent
                 * page of a large page with another process we must share the
                 * entire large page. Note this cannot happen for SOFTLOCK
                 * case, unless current address (lpaddr) is at the beginning
                 * of the next page size boundary because the other process
                 * couldn't have relocated locked pages.
                 */
                ASSERT(ierr == -1 || ierr == -2);
                if (segvn_anypgsz) {
                        ASSERT(ierr == -2 || szc != 0);
                        ASSERT(ierr == -1 || szc < sptseg->s_szc);
                        szc = (ierr == -1) ? szc - 1 : szc + 1;
                } else {
                        /*
                         * For faults and segvn_anypgsz == 0
                         * we need to be careful not to loop forever
                         * if existing page is found with szc other
                         * than 0 or seg->s_szc. This could be due
                         * to page relocations on behalf of DR or
                         * more likely large page creation. For this
                         * case simply re-size to existing page's szc
                         * if returned by anon_map_getpages().
                         */
                        if (ppa_szc == (uint_t)-1) {
                                szc = (ierr == -1) ? 0 : sptseg->s_szc;
                        } else {
                                ASSERT(ppa_szc <= sptseg->s_szc);
                                ASSERT(ierr == -2 || ppa_szc < szc);
                                ASSERT(ierr == -1 || ppa_szc > szc);
                                szc = ppa_szc;
                        }
                }
                /* Recompute the stride for the new page size and retry. */
                pg_sz = page_get_pagesize(szc);
                lp_npgs = btop(pg_sz);
                ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
        }
        if (anon_locked) {
                anon_array_exit(&cookie);
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);
        return (0);

lpgs_err:
        if (anon_locked) {
                anon_array_exit(&cookie);
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);
        /* Release every page collected before the failing chunk. */
        for (j = 0; j < ppa_idx; j++)
                page_unlock(ppa[j]);
        return (err);
}
2602
2603 /*
2604  * count the number of bytes in a set of spt pages that are currently not
2605  * locked
2606  */
2607 static rctl_qty_t
2608 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2609 {
2610         ulong_t i;
2611         rctl_qty_t unlocked = 0;
2612
2613         for (i = 0; i < npages; i++) {
2614                 if (ppa[i]->p_lckcnt == 0)
2615                         unlocked += PAGESIZE;
2616         }
2617         return (unlocked);
2618 }
2619
/* Defined outside this file; used by RAND_P2() below as a varying input. */
extern  u_longlong_t randtick(void);
/* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
#define NLCK    (NCPU_P2)
/*
 * Random number with a range [0, n-1], n must be power of two.
 * The value is the current thread pointer, shifted right by PTR24_LSB,
 * XORed with randtick() and masked with (n - 1).
 */
#define RAND_P2(n)      \
        ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
2626
/*
 * Lock pages of a DISM segment on behalf of one attachment.
 *
 *   seg         the shm segment whose shm_vpage[] records which pages this
 *               attachment has locked
 *   anon_index  index of the first page in shm_vpage[], spt_ppa_lckcnt[]
 *               and spt_ppa[]
 *   npages      number of pages to process
 *   ppa         the pages themselves; ppa[i] corresponds to anon_index + i
 *   lockmap     optional bitmap; when non-NULL, bit (pos + i) is set for
 *               each page this call marks DISM_PG_LOCKED
 *   pos         bit position in lockmap of the first page
 *   locked      out: PAGESIZE bytes for every page whose p_lckcnt became 1
 *               as a result of this call
 *
 * Pages already marked DISM_PG_LOCKED for this attachment, and pages whose
 * spt_ppa_lckcnt has reached DISM_LOCK_MAX, are skipped.
 *
 * Returns 0, or EAGAIN if page_pp_lock() fails.  In the EAGAIN case the
 * loop stops at the failing page; pages handled before it stay locked and
 * are reflected in *locked.
 */
int
spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    page_t **ppa, ulong_t *lockmap, size_t pos,
    rctl_qty_t *locked)
{
        struct  shm_data *shmd = seg->s_data;
        struct  spt_data *sptd = shmd->shm_sptseg->s_data;
        ulong_t i;
        int     kernel;
        pgcnt_t nlck = 0;
        int     rv = 0;
        int     use_reserved = 1;

        /* return the number of bytes actually locked */
        *locked = 0;

        /*
         * To avoid contention on freemem_lock, availrmem and pages_locked
         * global counters are updated only every nlck locked pages instead of
         * every time.  Reserve nlck locks up front and deduct from this
         * reservation for each page that requires a lock.  When the reservation
         * is consumed, reserve again.  nlck is randomized, so the competing
         * threads do not fall into a cyclic lock contention pattern. When
         * memory is low, the lock ahead is disabled, and instead page_pp_lock()
         * is used to lock pages.
         */
        for (i = 0; i < npages; anon_index++, pos++, i++) {
                if (nlck == 0 && use_reserved == 1) {
                        nlck = NLCK + RAND_P2(NLCK);
                        /* if fewer loops left, decrease nlck */
                        nlck = MIN(nlck, npages - i);
                        /*
                         * Reserve nlck locks up front and deduct from this
                         * reservation for each page that requires a lock.  When
                         * the reservation is consumed, reserve again.
                         */
                        mutex_enter(&freemem_lock);
                        if ((availrmem - nlck) < pages_pp_maximum) {
                                /* Do not do advance memory reserves */
                                use_reserved = 0;
                        } else {
                                availrmem       -= nlck;
                                pages_locked    += nlck;
                        }
                        mutex_exit(&freemem_lock);
                }
                if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
                        if (sptd->spt_ppa_lckcnt[anon_index] <
                            (ushort_t)DISM_LOCK_MAX) {
                                if (++sptd->spt_ppa_lckcnt[anon_index] ==
                                    (ushort_t)DISM_LOCK_MAX) {
                                        cmn_err(CE_WARN,
                                            "DISM page lock limit "
                                            "reached on DISM offset 0x%lx\n",
                                            anon_index << PAGESHIFT);
                                }
                                /*
                                 * A page with an spt_ppa[] entry is not
                                 * charged against the reservation here
                                 * (see the nlck-- test below).
                                 */
                                kernel = (sptd->spt_ppa &&
                                    sptd->spt_ppa[anon_index]);
                                if (!page_pp_lock(ppa[i], 0, kernel ||
                                    use_reserved)) {
                                        /* Undo the count taken above. */
                                        sptd->spt_ppa_lckcnt[anon_index]--;
                                        rv = EAGAIN;
                                        break;
                                }
                                /* if this is a newly locked page, count it */
                                if (ppa[i]->p_lckcnt == 1) {
                                        if (kernel == 0 && use_reserved == 1)
                                                nlck--;
                                        *locked += PAGESIZE;
                                }
                                shmd->shm_lckpgs++;
                                shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
                                if (lockmap != NULL)
                                        BT_SET(lockmap, pos);
                        }
                }
        }
        /* Return unused lock reservation */
        if (nlck != 0 && use_reserved == 1) {
                mutex_enter(&freemem_lock);
                availrmem       += nlck;
                pages_locked    -= nlck;
                mutex_exit(&freemem_lock);
        }

        return (rv);
}
2714
/*
 * Unlock the pages of a DISM segment that this attachment has locked.
 *
 *   seg         the shm segment whose shm_vpage[] records which pages this
 *               attachment has locked
 *   anon_index  index of the first page to consider
 *   npages      number of pages to consider
 *   unlocked    in/out: incremented by PAGESIZE for every page whose
 *               p_lckcnt drops to 0; it is not reset here, so the caller
 *               must initialize it
 *
 * Only pages marked DISM_PG_LOCKED in shm_vpage[] are touched; for each of
 * them the mark is cleared and spt_ppa_lckcnt[] and shm_lckpgs are
 * decremented.  The anon map's a_rwlock is held as reader for the whole
 * loop.
 *
 * Always returns 0.
 */
int
spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
    rctl_qty_t *unlocked)
{
        struct shm_data *shmd = seg->s_data;
        struct spt_data *sptd = shmd->shm_sptseg->s_data;
        struct anon_map *amp = sptd->spt_amp;
        struct anon     *ap;
        struct vnode    *vp;
        u_offset_t      off;
        struct page     *pp;
        int             kernel;
        anon_sync_obj_t cookie;
        ulong_t         i;
        pgcnt_t         nlck = 0;
        pgcnt_t         nlck_limit = NLCK;

        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
        for (i = 0; i < npages; i++, anon_index++) {
                if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
                        /* Find the page through its anon slot. */
                        anon_array_enter(amp, anon_index, &cookie);
                        ap = anon_get_ptr(amp->ahp, anon_index);
                        ASSERT(ap);

                        swap_xlate(ap, &vp, &off);
                        anon_array_exit(&cookie);
                        pp = page_lookup(vp, off, SE_SHARED);
                        ASSERT(pp);
                        /*
                         * availrmem is decremented only for pages which are not
                         * in seg pcache, for pages in seg pcache availrmem was
                         * decremented in _dismpagelock()
                         */
                        kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
                        ASSERT(pp->p_lckcnt > 0);

                        /*
                         * unlock page but do not change availrmem, we do it
                         * ourselves every nlck loops.
                         */
                        page_pp_unlock(pp, 0, 1);
                        if (pp->p_lckcnt == 0) {
                                if (kernel == 0)
                                        nlck++;
                                *unlocked += PAGESIZE;
                        }
                        page_unlock(pp);
                        shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
                        sptd->spt_ppa_lckcnt[anon_index]--;
                        shmd->shm_lckpgs--;
                }

                /*
                 * To reduce freemem_lock contention, do not update availrmem
                 * until at least NLCK pages have been unlocked.
                 * 1. No need to update if nlck is zero
                 * 2. Always update if the last iteration
                 */
                if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
                        mutex_enter(&freemem_lock);
                        availrmem       += nlck;
                        pages_locked    -= nlck;
                        mutex_exit(&freemem_lock);
                        nlck = 0;
                        /* Pick a fresh, randomized batch size. */
                        nlck_limit = NLCK + RAND_P2(NLCK);
                }
        }
        ANON_LOCK_EXIT(&amp->a_rwlock);

        return (0);
}
2786
/*
 * Lock (MC_LOCK) or unlock (MC_UNLOCK) a range of a shm segment.
 *
 *   seg      the shm segment
 *   addr     start of the range; rounded down to a page boundary
 *   len      length of the range in bytes
 *   attr     unused
 *   op       MC_LOCK or MC_UNLOCK; any other value falls through and
 *            returns 0
 *   lockmap  passed through to spt_lockpages() (MC_LOCK only)
 *   pos      passed through to spt_lockpages() (MC_LOCK only)
 *
 * Segments without SHM_PAGEABLE set need no work and return 0 at once.
 *
 * MC_LOCK widens the range to the spt segment's page size so that
 * spt_anon_getpages() can always work on whole large pages, but only the
 * pages originally asked for are locked.  The bytes about to be locked are
 * charged against the locked-memory resource control first, and whatever
 * could not be locked is credited back afterwards.
 *
 * The caller must hold the address space lock (asserted).
 *
 * Returns 0 on success; ENOMEM if the range extends past the anon map or
 * the page array cannot be allocated; EAGAIN if the resource control
 * refuses the charge or spt_lockpages() fails; otherwise the value
 * returned by spt_anon_getpages().
 *
 * NOTE(review): spt_anon_getpages() builds its failure value with
 * FC_MAKE_ERR(), while every other failure here is an errno - confirm
 * that callers expect this.
 */
/*ARGSUSED*/
static int
segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos)
{
        struct shm_data *shmd = seg->s_data;
        struct seg      *sptseg = shmd->shm_sptseg;
        struct spt_data *sptd = sptseg->s_data;
        struct kshmid   *sp = sptd->spt_amp->a_sp;
        pgcnt_t         npages, a_npages;
        page_t          **ppa;
        pgcnt_t         an_idx, a_an_idx, ppa_idx;
        caddr_t         spt_addr, a_addr;       /* spt and aligned address */
        size_t          a_len;                  /* aligned len */
        size_t          share_sz;
        ulong_t         i;
        int             sts = 0;
        rctl_qty_t      unlocked = 0;
        rctl_qty_t      locked = 0;
        struct proc     *p = curproc;
        kproject_t      *proj;

        ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
        ASSERT(sp != NULL);

        if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
                return (0);
        }

        addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        an_idx = seg_page(seg, addr);
        npages = btopr(len);

        if (an_idx + npages > btopr(shmd->shm_amp->size)) {
                return (ENOMEM);
        }

        /*
         * A shm's project never changes, so no lock needed.
         * The shm has a hold on the project, so it will not go away.
         * Since we have a mapping to shm within this zone, we know
         * that the zone will not go away.
         */
        proj = sp->shm_perm.ipc_proj;

        if (op == MC_LOCK) {

                /*
                 * Need to align addr and size request if they are not
                 * aligned so we can always allocate large page(s) however
                 * we only lock what was requested in initial request.
                 */
                share_sz = page_get_pagesize(sptseg->s_szc);
                a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
                a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
                    share_sz);
                a_npages = btop(a_len);
                a_an_idx = seg_page(seg, a_addr);
                spt_addr = sptseg->s_base + ptob(a_an_idx);
                /* Offset of the first requested page in the aligned array. */
                ppa_idx = an_idx - a_an_idx;

                if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
                    KM_NOSLEEP)) == NULL) {
                        return (ENOMEM);
                }

                /*
                 * Don't cache any new pages for IO and
                 * flush any cached pages.
                 */
                mutex_enter(&sptd->spt_lock);
                if (sptd->spt_ppa != NULL)
                        sptd->spt_flags |= DISM_PPA_CHANGED;

                sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
                if (sts != 0) {
                        mutex_exit(&sptd->spt_lock);
                        kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
                        return (sts);
                }

                mutex_enter(&sp->shm_mlock);
                /* enforce locked memory rctl */
                unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);

                mutex_enter(&p->p_lock);
                if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
                        mutex_exit(&p->p_lock);
                        sts = EAGAIN;
                } else {
                        mutex_exit(&p->p_lock);
                        sts = spt_lockpages(seg, an_idx, npages,
                            &ppa[ppa_idx], lockmap, pos, &locked);

                        /*
                         * correct locked count if not all pages could be
                         * locked
                         */
                        if ((unlocked - locked) > 0) {
                                rctl_decr_locked_mem(NULL, proj,
                                    (unlocked - locked), 0);
                        }
                }
                /*
                 * unlock pages
                 */
                for (i = 0; i < a_npages; i++)
                        page_unlock(ppa[i]);
                if (sptd->spt_ppa != NULL)
                        sptd->spt_flags |= DISM_PPA_CHANGED;
                mutex_exit(&sp->shm_mlock);
                mutex_exit(&sptd->spt_lock);

                kmem_free(ppa, ((sizeof (page_t *)) * a_npages));

        } else if (op == MC_UNLOCK) { /* unlock */
                /* Shadows the outer ppa; here it is the cached spt_ppa. */
                page_t          **ppa;

                mutex_enter(&sptd->spt_lock);
                /* This attachment holds no locked pages: nothing to undo. */
                if (shmd->shm_lckpgs == 0) {
                        mutex_exit(&sptd->spt_lock);
                        return (0);
                }
                /*
                 * Don't cache new IO pages.
                 */
                if (sptd->spt_ppa != NULL)
                        sptd->spt_flags |= DISM_PPA_CHANGED;

                mutex_enter(&sp->shm_mlock);
                sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
                if ((ppa = sptd->spt_ppa) != NULL)
                        sptd->spt_flags |= DISM_PPA_CHANGED;
                mutex_exit(&sptd->spt_lock);

                /* Credit the unlocked bytes back to the resource control. */
                rctl_decr_locked_mem(NULL, proj, unlocked, 0);
                mutex_exit(&sp->shm_mlock);

                /* Purge outside spt_lock, using the pointer sampled above. */
                if (ppa != NULL)
                        seg_ppurge_wiredpp(ppa);
        }
        return (sts);
}
2930
2931 /*ARGSUSED*/
2932 int
2933 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2934 {
2935         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2936         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2937         spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2938
2939         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2940
2941         /*
2942          * ISM segment is always rw.
2943          */
2944         while (--pgno >= 0)
2945                 *protv++ = sptd->spt_prot;
2946         return (0);
2947 }
2948
/*
 * Return the offset that corresponds to addr.  addr is ignored and the
 * result is always 0.  The caller must hold the address space lock
 * (asserted).
 */
/*ARGSUSED*/
u_offset_t
segspt_shmgetoffset(struct seg *seg, caddr_t addr)
{
        ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

        /* Offset does not matter in ISM memory */

        return ((u_offset_t)0);
}
2959
2960 /* ARGSUSED */
2961 int
2962 segspt_shmgettype(struct seg *seg, caddr_t addr)
2963 {
2964         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2965         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2966
2967         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2968
2969         /*
2970          * The shared memory mapping is always MAP_SHARED, SWAP is only
2971          * reserved for DISM
2972          */
2973         return (MAP_SHARED |
2974             ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2975 }
2976
2977 /*ARGSUSED*/
2978 int
2979 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2980 {
2981         struct shm_data *shmd = (struct shm_data *)seg->s_data;
2982         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2983
2984         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2985
2986         *vpp = sptd->spt_vp;
2987         return (0);
2988 }
2989
/*
 * We need to wait for pending IO to complete to a DISM segment in order for
 * pages to get kicked out of the seg_pcache.  120 seconds should be more
 * than enough time to wait.
 *
 * The value is in seconds: segspt_shmadvise() multiplies it by hz to
 * compute the lbolt value at which it stops waiting.
 */
static clock_t spt_pcache_wait = 120;
2996
2997 /*ARGSUSED*/
2998 static int
2999 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
3000 {
3001         struct shm_data *shmd = (struct shm_data *)seg->s_data;
3002         struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
3003         struct anon_map *amp;
3004         pgcnt_t pg_idx;
3005         ushort_t gen;
3006         clock_t end_lbolt;
3007         int writer;
3008         page_t **ppa;
3009
3010         ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
3011
3012         if (behav == MADV_FREE || behav == MADV_PURGE) {
3013                 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
3014                         return (0);
3015
3016                 amp = sptd->spt_amp;
3017                 pg_idx = seg_page(seg, addr);
3018
3019                 mutex_enter(&sptd->spt_lock);
3020                 if ((ppa = sptd->spt_ppa) == NULL) {
3021                         mutex_exit(&sptd->spt_lock);
3022                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3023                         (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3024                         ANON_LOCK_EXIT(&amp->a_rwlock);
3025                         return (0);
3026                 }
3027
3028                 sptd->spt_flags |= DISM_PPA_CHANGED;
3029                 gen = sptd->spt_gen;
3030
3031                 mutex_exit(&sptd->spt_lock);
3032
3033                 /*
3034                  * Purge all DISM cached pages
3035                  */
3036                 seg_ppurge_wiredpp(ppa);
3037
3038                 /*
3039                  * Drop the AS_LOCK so that other threads can grab it
3040                  * in the as_pageunlock path and hopefully get the segment
3041                  * kicked out of the seg_pcache.  We bump the shm_softlockcnt
3042                  * to keep this segment resident.
3043                  */
3044                 writer = AS_WRITE_HELD(seg->s_as);
3045                 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3046                 AS_LOCK_EXIT(seg->s_as);
3047
3048                 mutex_enter(&sptd->spt_lock);
3049
3050                 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
3051
3052                 /*
3053                  * Try to wait for pages to get kicked out of the seg_pcache.
3054                  */
3055                 while (sptd->spt_gen == gen &&
3056                     (sptd->spt_flags & DISM_PPA_CHANGED) &&
3057                     ddi_get_lbolt() < end_lbolt) {
3058                         if (!cv_timedwait_sig(&sptd->spt_cv,
3059                             &sptd->spt_lock, end_lbolt)) {
3060                                 break;
3061                         }
3062                 }
3063
3064                 mutex_exit(&sptd->spt_lock);
3065
3066                 /* Regrab the AS_LOCK and release our hold on the segment */
3067                 AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
3068                 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
3069                 if (shmd->shm_softlockcnt <= 0) {
3070                         if (AS_ISUNMAPWAIT(seg->s_as)) {
3071                                 mutex_enter(&seg->s_as->a_contents);
3072                                 if (AS_ISUNMAPWAIT(seg->s_as)) {
3073                                         AS_CLRUNMAPWAIT(seg->s_as);
3074                                         cv_broadcast(&seg->s_as->a_cv);
3075                                 }
3076                                 mutex_exit(&seg->s_as->a_contents);
3077                         }
3078                 }
3079
3080                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3081                 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
3082                 ANON_LOCK_EXIT(&amp->a_rwlock);
3083         } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
3084             behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
3085                 int                     already_set;
3086                 ulong_t                 anon_index;
3087                 lgrp_mem_policy_t       policy;
3088                 caddr_t                 shm_addr;
3089                 size_t                  share_size;
3090                 size_t                  size;
3091                 struct seg              *sptseg = shmd->shm_sptseg;
3092                 caddr_t                 sptseg_addr;
3093
3094                 /*
3095                  * Align address and length to page size of underlying segment
3096                  */
3097                 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
3098                 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
3099                 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
3100                     share_size);
3101
3102                 amp = shmd->shm_amp;
3103                 anon_index = seg_page(seg, shm_addr);
3104
3105                 /*
3106                  * And now we may have to adjust size downward if we have
3107                  * exceeded the realsize of the segment or initial anon
3108                  * allocations.
3109                  */
3110                 sptseg_addr = sptseg->s_base + ptob(anon_index);
3111                 if ((sptseg_addr + size) >
3112                     (sptseg->s_base + sptd->spt_realsize))
3113                         size = (sptseg->s_base + sptd->spt_realsize) -
3114                             sptseg_addr;
3115
3116                 /*
3117                  * Set memory allocation policy for this segment
3118                  */
3119                 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
3120                 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
3121                     NULL, 0, len);
3122
3123                 /*
3124                  * If random memory allocation policy set already,
3125                  * don't bother reapplying it.
3126                  */
3127                 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
3128                         return (0);
3129
3130                 /*
3131                  * Mark any existing pages in the given range for
3132                  * migration, flushing the I/O page cache, and using
3133                  * underlying segment to calculate anon index and get
3134                  * anonmap and vnode pointer from
3135                  */
3136                 if (shmd->shm_softlockcnt > 0)
3137                         segspt_purge(seg);
3138
3139                 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
3140         }
3141
3142         return (0);
3143 }
3144
/*
 * Dump entry point for a shared-memory segment.  Nothing is done for an
 * ISM segment: the argument is ignored and the routine simply returns.
 */
/*ARGSUSED*/
void
segspt_shmdump(struct seg *seg)
{
}
3151
3152 /*ARGSUSED*/
3153 static int
3154 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3155 {
3156         return (ENOTSUP);
3157 }
3158
3159 /*
3160  * get a memory ID for an addr in a given segment
3161  */
3162 static int
3163 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3164 {
3165         struct shm_data *shmd = (struct shm_data *)seg->s_data;
3166         struct anon     *ap;
3167         size_t          anon_index;
3168         struct anon_map *amp = shmd->shm_amp;
3169         struct spt_data *sptd = shmd->shm_sptseg->s_data;
3170         struct seg      *sptseg = shmd->shm_sptseg;
3171         anon_sync_obj_t cookie;
3172
3173         anon_index = seg_page(seg, addr);
3174
3175         if (addr > (seg->s_base + sptd->spt_realsize)) {
3176                 return (EFAULT);
3177         }
3178
3179         ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3180         anon_array_enter(amp, anon_index, &cookie);
3181         ap = anon_get_ptr(amp->ahp, anon_index);
3182         if (ap == NULL) {
3183                 struct page *pp;
3184                 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3185
3186                 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3187                 if (pp == NULL) {
3188                         anon_array_exit(&cookie);
3189                         ANON_LOCK_EXIT(&amp->a_rwlock);
3190                         return (ENOMEM);
3191                 }
3192                 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3193                 page_unlock(pp);
3194         }
3195         anon_array_exit(&cookie);
3196         ANON_LOCK_EXIT(&amp->a_rwlock);
3197         memidp->val[0] = (uintptr_t)ap;
3198         memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3199         return (0);
3200 }
3201
3202 /*
3203  * Get memory allocation policy info for specified address in given segment
3204  */
3205 static lgrp_mem_policy_info_t *
3206 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3207 {
3208         struct anon_map         *amp;
3209         ulong_t                 anon_index;
3210         lgrp_mem_policy_info_t  *policy_info;
3211         struct shm_data         *shm_data;
3212
3213         ASSERT(seg != NULL);
3214
3215         /*
3216          * Get anon_map from segshm
3217          *
3218          * Assume that no lock needs to be held on anon_map, since
3219          * it should be protected by its reference count which must be
3220          * nonzero for an existing segment
3221          * Need to grab readers lock on policy tree though
3222          */
3223         shm_data = (struct shm_data *)seg->s_data;
3224         if (shm_data == NULL)
3225                 return (NULL);
3226         amp = shm_data->shm_amp;
3227         ASSERT(amp->refcnt != 0);
3228
3229         /*
3230          * Get policy info
3231          *
3232          * Assume starting anon index of 0
3233          */
3234         anon_index = seg_page(seg, addr);
3235         policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3236
3237         return (policy_info);
3238 }
3239
3240 /*ARGSUSED*/
3241 static int
3242 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3243 {
3244         return (0);
3245 }