kernel/vm/vm_anon.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2015, Joyent, Inc. All rights reserved.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 /*
  40  * VM - anonymous pages.
  41  *
  42  * This layer sits immediately above the vm_swap layer.  It manages
  43  * physical pages that have no permanent identity in the file system
  44  * name space, using the services of the vm_swap layer to allocate
  45  * backing storage for these pages.  Since these pages have no external
  46  * identity, they are discarded when the last reference is removed.
  47  *
  48  * An important function of this layer is to manage low-level sharing
  49  * of pages that are logically distinct but that happen to be
  50  * physically identical (e.g., the corresponding pages of the processes
  51  * resulting from a fork before one process or the other changes their
  52  * contents).  This pseudo-sharing is present only as an optimization
  53  * and is not to be confused with true sharing in which multiple
  54  * address spaces deliberately contain references to the same object;
  55  * such sharing is managed at a higher level.
  56  *
  57  * The key data structure here is the anon struct, which contains a
  58  * reference count for its associated physical page and a hint about
  59  * the identity of that page.  Anon structs typically live in arrays,
  60  * with an instance's position in its array determining where the
  61  * corresponding backing storage is allocated; however, the swap_xlate()
  62  * routine abstracts away this representation information so that the
  63  * rest of the anon layer need not know it.  (See the swap layer for
  64  * more details on anon struct layout.)
  65  *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout time, where we can make smarter
 * allocation decisions to improve anonymous klustering.
  73  *
  74  * Many of the routines defined here take a (struct anon **) argument,
  75  * which allows the code at this level to manage anon pages directly,
  76  * so that callers can regard anon structs as opaque objects and not be
  77  * concerned with assigning or inspecting their contents.
  78  *
  79  * Clients of this layer refer to anon pages indirectly.  That is, they
  80  * maintain arrays of pointers to anon structs rather than maintaining
  81  * anon structs themselves.  The (struct anon **) arguments mentioned
  82  * above are pointers to entries in these arrays.  It is these arrays
  83  * that capture the mapping between offsets within a given segment and
  84  * the corresponding anonymous backing storage address.
  85  */
  86
  87 #ifdef DEBUG
  88 #define ANON_DEBUG
  89 #endif
  90
  91 #include <sys/types.h>
  92 #include <sys/t_lock.h>
  93 #include <sys/param.h>
  94 #include <sys/systm.h>
  95 #include <sys/mman.h>
  96 #include <sys/cred.h>
  97 #include <sys/thread.h>
  98 #include <sys/vnode.h>
  99 #include <sys/cpuvar.h>
 100 #include <sys/swap.h>
 101 #include <sys/cmn_err.h>
 102 #include <sys/vtrace.h>
 103 #include <sys/kmem.h>
 104 #include <sys/sysmacros.h>
 105 #include <sys/bitmap.h>
 106 #include <sys/vmsystm.h>
 107 #include <sys/tuneable.h>
 108 #include <sys/debug.h>
 109 #include <sys/fs/swapnode.h>
 110 #include <sys/tnf_probe.h>
 111 #include <sys/lgrp.h>
 112 #include <sys/policy.h>
 113 #include <sys/condvar_impl.h>
 114 #include <sys/mutex_impl.h>
 115 #include <sys/rctl.h>
 116
 117 #include <vm/as.h>
 118 #include <vm/hat.h>
 119 #include <vm/anon.h>
 120 #include <vm/page.h>
 121 #include <vm/vpage.h>
 122 #include <vm/seg.h>
 123 #include <vm/rm.h>
 124
 125 #include <sys/fs_subr.h>
 126
/*
 * Vnode set up once by anon_init(): it is given swap_vnodeops, type VREG
 * and the VISSWAP|VISSWAPFS flags.
 */
struct vnode *anon_vp;

int anon_debug;

/* Held by anon_resvmem() while it reads and updates k_anoninfo. */
kmutex_t        anoninfo_lock;
struct          k_anoninfo k_anoninfo;
/*
 * Array of ANI_MAX_POOL counters, allocated 64-byte aligned by anon_init().
 * set_anoninfo() sums the ani_count fields into k_anoninfo.ani_free.
 */
ani_free_t      *ani_free_pool;
/* Both arrays are initialized entry by entry in anon_init(). */
pad_mutex_t     anon_array_lock[ANON_LOCKSIZE];
kcondvar_t      anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 *
 * anon_hash holds anon_hash_size (1 << anon_hash_shift) chain heads and is
 * allocated by anon_init(), which also assigns swap_maxcontig.
 */
extern  int swap_maxcontig;
size_t  anon_hash_size;
unsigned int anon_hash_shift;
struct anon **anon_hash;

/* kmem caches for struct anon and struct anon_map, created in anon_init(). */
static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

/*
 * AH_LOCK_SIZE padded mutexes.  anon_init() carves this array and
 * anonpages_hash_lock out of one allocation aligned to 64 bytes.
 */
pad_mutex_t     *anonhash_lock;

/*
 * Used to make the increment of all refcnts of all anon slots of a large
 * page appear to be atomic.  The lock is grabbed for the first anon slot of
 * a large page.
 */
pad_mutex_t     *anonpages_hash_lock;

/*
 * Select the anonpages_hash_lock mutex for (vp, off): the ANON_HASH value
 * is masked with (AH_LOCK_SIZE - 1) to pick the array entry.
 */
#define APH_MUTEX(vp, off)                              \
        (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
            (AH_LOCK_SIZE - 1))].pad_mutex)
 160
#ifdef VM_STATS
/*
 * Counter arrays that exist only when VM_STATS is defined.
 * NOTE(review): nothing in this part of the file updates them, so the
 * meaning of each array slot must be taken from the code that increments
 * it -- confirm there before relying on a particular index.
 */
static struct anonvmstats_str {
        ulong_t getpages[30];
        ulong_t privatepages[10];
        ulong_t demotepages[9];
        ulong_t decrefpages[9];
        ulong_t dupfillholes[4];
        ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */
 171
 172 /*ARGSUSED*/
 173 static int
 174 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
 175 {
 176         struct anon_map *amp = buf;
 177
 178         rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
 179         cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
 180         mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
 181         mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
 182         return (0);
 183 }
 184
 185 /*ARGSUSED1*/
 186 static void
 187 anonmap_cache_destructor(void *buf, void *cdrarg)
 188 {
 189         struct anon_map *amp = buf;
 190
 191         rw_destroy(&amp->a_rwlock);
 192         cv_destroy(&amp->a_purgecv);
 193         mutex_destroy(&amp->a_pmtx);
 194         mutex_destroy(&amp->a_purgemtx);
 195 }
 196
 197 void
 198 anon_init(void)
 199 {
 200         int i;
 201         pad_mutex_t *tmp;
 202
 203         /* These both need to be powers of 2 so round up to the next power */
 204         anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
 205         anon_hash_size = 1L << anon_hash_shift;
 206
 207         /*
 208          * We need to align the anonhash_lock and anonpages_hash_lock arrays
 209          * to a 64B boundary to avoid false sharing.  We add 63B to our
 210          * allocation so that we can get a 64B aligned address to use.
 211          * We allocate both of these together to avoid wasting an additional
 212          * 63B.
 213          */
 214         tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
 215             KM_SLEEP);
 216         anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
 217         anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
 218
 219         for (i = 0; i < AH_LOCK_SIZE; i++) {
 220                 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
 221                     NULL);
 222                 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
 223                     MUTEX_DEFAULT, NULL);
 224         }
 225
 226         for (i = 0; i < ANON_LOCKSIZE; i++) {
 227                 mutex_init(&anon_array_lock[i].pad_mutex, NULL,
 228                     MUTEX_DEFAULT, NULL);
 229                 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
 230         }
 231
 232         anon_hash = (struct anon **)
 233             kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
 234         anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
 235             AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
 236         anonmap_cache = kmem_cache_create("anonmap_cache",
 237             sizeof (struct anon_map), 0,
 238             anonmap_cache_constructor, anonmap_cache_destructor, NULL,
 239             NULL, NULL, 0);
 240         swap_maxcontig = (1024 * 1024) >> PAGESHIFT;    /* 1MB of pages */
 241
 242         tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
 243         /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
 244         ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
 245
 246         anon_vp = vn_alloc(KM_SLEEP);
 247         vn_setops(anon_vp, &swap_vnodeops);
 248         anon_vp->v_type = VREG;
 249         anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
 250 }
 251
 252 /*
 253  * Global anon slot hash table manipulation.
 254  */
 255
 256 static void
 257 anon_addhash(struct anon *ap)
 258 {
 259         int index;
 260
 261         ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
 262         index = ANON_HASH(ap->an_vp, ap->an_off);
 263         ap->an_hash = anon_hash[index];
 264         anon_hash[index] = ap;
 265 }
 266
 267 static void
 268 anon_rmhash(struct anon *ap)
 269 {
 270         struct anon **app;
 271
 272         ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
 273
 274         for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
 275             *app; app = &((*app)->an_hash)) {
 276                 if (*app == ap) {
 277                         *app = ap->an_hash;
 278                         break;
 279                 }
 280         }
 281 }
 282
 283 /*
 284  * The anon array interfaces. Functions allocating,
 285  * freeing array of pointers, and returning/setting
 286  * entries in the array of pointers for a given offset.
 287  *
 288  * Create the list of pointers
 289  */
 290 struct anon_hdr *
 291 anon_create(pgcnt_t npages, int flags)
 292 {
 293         struct anon_hdr *ahp;
 294         ulong_t nchunks;
 295         int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 296
 297         if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
 298                 return (NULL);
 299         }
 300
 301         mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
 302         /*
 303          * Single level case.
 304          */
 305         ahp->size = npages;
 306         if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
 307
 308                 if (flags & ANON_ALLOC_FORCE)
 309                         ahp->flags |= ANON_ALLOC_FORCE;
 310
 311                 ahp->array_chunk = kmem_zalloc(
 312                     ahp->size * sizeof (struct anon *), kmemflags);
 313
 314                 if (ahp->array_chunk == NULL) {
 315                         kmem_free(ahp, sizeof (struct anon_hdr));
 316                         return (NULL);
 317                 }
 318         } else {
 319                 /*
 320                  * 2 Level case.
 321                  * anon hdr size needs to be rounded off  to be a multiple
 322                  * of ANON_CHUNK_SIZE. This is important as various anon
 323                  * related functions depend on this.
 324                  * NOTE -
 325                  * anon_grow()  makes anon hdr size a multiple of
 326                  * ANON_CHUNK_SIZE.
 327                  * amp size is <= anon hdr size.
 328                  * anon_index + seg_pgs <= anon hdr size.
 329                  */
 330                 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
 331                 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
 332
 333                 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
 334                     kmemflags);
 335
 336                 if (ahp->array_chunk == NULL) {
 337                         kmem_free(ahp, sizeof (struct anon_hdr));
 338                         return (NULL);
 339                 }
 340         }
 341         return (ahp);
 342 }
 343
 344 /*
 345  * Free the array of pointers
 346  */
 347 void
 348 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
 349 {
 350         ulong_t i;
 351         void **ppp;
 352         ulong_t nchunks;
 353
 354         ASSERT(npages <= ahp->size);
 355
 356         /*
 357          * Single level case.
 358          */
 359         if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
 360                 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
 361         } else {
 362                 /*
 363                  * 2 level case.
 364                  */
 365                 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
 366                 for (i = 0; i < nchunks; i++) {
 367                         ppp = &ahp->array_chunk[i];
 368                         if (*ppp != NULL)
 369                                 kmem_free(*ppp, PAGESIZE);
 370                 }
 371                 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
 372         }
 373         mutex_destroy(&ahp->serial_lock);
 374         kmem_free(ahp, sizeof (struct anon_hdr));
 375 }
 376
 377 /*
 378  * Return the pointer from the list for a
 379  * specified anon index.
 380  */
 381 struct anon *
 382 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
 383 {
 384         struct anon **app;
 385
 386         ASSERT(an_idx < ahp->size);
 387
 388         /*
 389          * Single level case.
 390          */
 391         if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
 392                 return ((struct anon *)
 393                     ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
 394         } else {
 395
 396                 /*
 397                  * 2 level case.
 398                  */
 399                 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
 400                 if (app) {
 401                         return ((struct anon *)
 402                             ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
 403                             ANON_PTRMASK));
 404                 } else {
 405                         return (NULL);
 406                 }
 407         }
 408 }
 409
 410 /*
 411  * Return the anon pointer for the first valid entry in the anon list,
 412  * starting from the given index.
 413  */
 414 struct anon *
 415 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
 416 {
 417         struct anon *ap;
 418         struct anon **app;
 419         ulong_t chunkoff;
 420         ulong_t i;
 421         ulong_t j;
 422         pgcnt_t size;
 423
 424         i = *index;
 425         size = ahp->size;
 426
 427         ASSERT(i < size);
 428
 429         if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
 430                 /*
 431                  * 1 level case
 432                  */
 433                 while (i < size) {
 434                         ap = (struct anon *)
 435                             ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
 436                         if (ap) {
 437                                 *index = i;
 438                                 return (ap);
 439                         }
 440                         i++;
 441                 }
 442         } else {
 443                 /*
 444                  * 2 level case
 445                  */
 446                 chunkoff = i & ANON_CHUNK_OFF;
 447                 while (i < size) {
 448                         app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
 449                         if (app)
 450                                 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
 451                                         ap = (struct anon *)
 452                                             ((uintptr_t)app[j] & ANON_PTRMASK);
 453                                         if (ap) {
 454                                                 *index = i + (j - chunkoff);
 455                                                 return (ap);
 456                                         }
 457                                 }
 458                         chunkoff = 0;
 459                         i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
 460                 }
 461         }
 462         *index = size;
 463         return (NULL);
 464 }
 465
 466 /*
 467  * Set list entry with a given pointer for a specified offset
 468  */
 469 int
 470 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
 471 {
 472         void            **ppp;
 473         struct anon     **app;
 474         int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
 475         uintptr_t       *ap_addr;
 476
 477         ASSERT(an_idx < ahp->size);
 478
 479         /*
 480          * Single level case.
 481          */
 482         if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
 483                 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
 484         } else {
 485
 486                 /*
 487                  * 2 level case.
 488                  */
 489                 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
 490
 491                 ASSERT(ppp != NULL);
 492                 if (*ppp == NULL) {
 493                         mutex_enter(&ahp->serial_lock);
 494                         ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
 495                         if (*ppp == NULL) {
 496                                 *ppp = kmem_zalloc(PAGESIZE, kmemflags);
 497                                 if (*ppp == NULL) {
 498                                         mutex_exit(&ahp->serial_lock);
 499                                         return (ENOMEM);
 500                                 }
 501                         }
 502                         mutex_exit(&ahp->serial_lock);
 503                 }
 504                 app = *ppp;
 505                 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
 506         }
 507         *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
 508         return (0);
 509 }
 510
/*
 * Copy anon array into a given new anon array
 *
 * Copies npages slots from sahp, starting at s_idx, into dahp, starting at
 * d_idx.  flags may carry ANON_NOSLEEP, which makes any chunk allocation
 * needed in dahp use KM_NOSLEEP.
 *
 * Returns 0 on success or ENOMEM if a destination chunk cannot be
 * allocated.  On ENOMEM, slots already copied and chunks already allocated
 * in dahp are left in place.
 *
 * Three strategies are used depending on the layout of the two arrays:
 *  - both flat: one bcopy() of the raw slot words;
 *  - both two-level: bcopy() of raw slot words, one chunk-bounded run at a
 *    time;
 *  - anything else: slot by slot through anon_get_ptr()/anon_set_ptr().
 * The bcopy() paths copy whole slot words, while the slot-by-slot path
 * transfers only the ANON_PTRMASK bits (anon_get_ptr() masks the source and
 * anon_set_ptr() keeps the destination's other bits).
 */
int
anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
        struct anon_hdr *dahp, ulong_t d_idx,
        pgcnt_t npages, int flags)
{
        void **sapp, **dapp;
        void *ap;
        int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

        ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
        ASSERT((npages <= sahp->size) && (npages <= dahp->size));

        /*
         * Both arrays are 1 level.
         */
        if (((sahp->size <= ANON_CHUNK_SIZE) &&
            (dahp->size <= ANON_CHUNK_SIZE)) ||
            ((sahp->flags & ANON_ALLOC_FORCE) &&
            (dahp->flags & ANON_ALLOC_FORCE))) {

                bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
                    npages * sizeof (struct anon *));
                return (0);
        }

        /*
         * Both arrays are 2 levels.
         */
        if (sahp->size > ANON_CHUNK_SIZE &&
            dahp->size > ANON_CHUNK_SIZE &&
            ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
            ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {

                ulong_t sapidx, dapidx;
                ulong_t *sap, *dap;
                ulong_t chknp;

                while (npages != 0) {

                        sapidx = s_idx & ANON_CHUNK_OFF;
                        dapidx = d_idx & ANON_CHUNK_OFF;
                        /*
                         * Largest run that stays inside the current chunk
                         * of both the source and the destination.
                         */
                        chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
                        if (chknp > npages)
                                chknp = npages;

                        sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
                        /*
                         * A missing source chunk means there is nothing to
                         * copy for this run; the destination is left as is.
                         */
                        if ((sap = *sapp) != NULL) {
                                dapp = &dahp->array_chunk[d_idx
                                    >> ANON_CHUNK_SHIFT];
                                if ((dap = *dapp) == NULL) {
                                        *dapp = kmem_zalloc(PAGESIZE,
                                            kmemflags);
                                        if ((dap = *dapp) == NULL)
                                                return (ENOMEM);
                                }
                                /* chknp slots, converted to a byte count */
                                bcopy((sap + sapidx), (dap + dapidx),
                                    chknp << ANON_PTRSHIFT);
                        }
                        s_idx += chknp;
                        d_idx += chknp;
                        npages -= chknp;
                }
                return (0);
        }

        /*
         * At least one of the arrays is 2 level.
         * (This path is also taken when both arrays are flat but only one
         * of them is flat because of ANON_ALLOC_FORCE.)
         */
        while (npages--) {
                /* empty source slots are skipped, not copied as NULL */
                if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
                        ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
                        if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
                                        return (ENOMEM);
                }
                s_idx++;
                d_idx++;
        }
        return (0);
}
 593
 594
 595 /*
 596  * ANON_INITBUF is a convenience macro for anon_grow() below. It
 597  * takes a buffer dst, which is at least as large as buffer src. It
 598  * does a bcopy from src into dst, and then bzeros the extra bytes
 599  * of dst. If tail is set, the data in src is tail aligned within
 600  * dst instead of head aligned.
 601  */
 602
 603 #define ANON_INITBUF(src, srclen, dst, dstsize, tail)                         \
 604         if (tail) {                                                           \
 605                 bzero((dst), (dstsize) - (srclen));                           \
 606                 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
 607         } else {                                                              \
 608                 bcopy((src), (dst), (srclen));                                \
 609                 bzero((char *)(dst) + (srclen), (dstsize) - (srclen));        \
 610         }
 611
/*
 * Rounding granularity used by anon_grow() for the new array size, in
 * pages: ANON_1_LEVEL_INC when the result is a one-level array,
 * ANON_2_LEVEL_INC (ANON_1_LEVEL_INC chunks' worth of pages) when it is a
 * two-level array.
 */
#define ANON_1_LEVEL_INC        (ANON_CHUNK_SIZE / 8)
#define ANON_2_LEVEL_INC        (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
 614
/*
 * anon_grow() is used to efficiently extend an existing anon array.
 * startidx_p points to the index into the anon array of the first page
 * that is in use. oldseg_pgs is the number of pages in use, starting at
 * *startidx_p. newseg_pgs is the number of additional pages desired.
 *
 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed;
 * ANON_GROWDOWN must not be specified in that case.
 *
 * The growth is done by creating a new top level of the anon array,
 * and (if the array is 2-level) reusing the existing second level arrays.
 *
 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
 *
 * Returns the new number of pages in the anon array, or 0 if a required
 * allocation fails.
 */
pgcnt_t
anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
    pgcnt_t newseg_pgs, int flags)
{
        ulong_t startidx = startidx_p ? *startidx_p : 0;
        pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
        pgcnt_t oelems, nelems, totpages;
        void **level1;
        int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
        int growdown = (flags & ANON_GROWDOWN);
        size_t newarrsz, oldarrsz;
        void *level2;

        /* growing down rewrites *startidx_p, so it must be supplied */
        ASSERT(!(startidx_p == NULL && growdown));
        ASSERT(startidx + oldseg_pgs <= ahp->size);

        /*
         * Determine the total number of pages needed in the new
         * anon array. If growing down, totpages is all pages from
         * startidx through the end of the array, plus <newseg_pgs>
         * pages. If growing up, keep all pages from page 0 through
         * the last page currently in use, plus <newseg_pgs> pages.
         */
        if (growdown)
                totpages = oldamp_pgs - startidx + newseg_pgs;
        else
                totpages = startidx + oldseg_pgs + newseg_pgs;

        /* If the array is already large enough, just return. */

        if (oldamp_pgs >= totpages) {
                /*
                 * Nothing is reallocated.  When growing down the new
                 * start index works out to startidx - newseg_pgs.
                 */
                if (growdown)
                        *startidx_p = oldamp_pgs - totpages;
                return (oldamp_pgs);
        }

        /*
         * oldamp_pgs/newamp_pgs are the total numbers of pages represented
         * by the corresponding arrays.
         * oelems/nelems are the number of pointers in the top level arrays
         * which may be either level 1 or level 2.
         * Will the new anon array be one level or two levels?
         */
        if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
                newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
                oelems = oldamp_pgs;
                nelems = newamp_pgs;
        } else {
                newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
                oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
                nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
        }

        /*
         * Allocate the new top level before touching ahp, so a failure
         * here leaves the existing array intact.
         */
        newarrsz = nelems * sizeof (void *);
        level1 = kmem_alloc(newarrsz, kmemflags);
        if (level1 == NULL)
                return (0);

        /* Are we converting from a one level to a two level anon array? */

        if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
            !(ahp->flags & ANON_ALLOC_FORCE)) {

                /*
                 * Yes, we're converting to a two level. Reuse old level 1
                 * as new level 2 if it is exactly PAGESIZE. Otherwise
                 * alloc a new level 2 and copy the old level 1 data into it.
                 */
                if (oldamp_pgs == ANON_CHUNK_SIZE) {
                        level2 = (void *)ahp->array_chunk;
                } else {
                        level2 = kmem_alloc(PAGESIZE, kmemflags);
                        if (level2 == NULL) {
                                /* ahp has not been modified yet */
                                kmem_free(level1, newarrsz);
                                return (0);
                        }
                        oldarrsz = oldamp_pgs * sizeof (void *);

                        /*
                         * Old slots go to the tail of the chunk when
                         * growing down, to the head otherwise; the rest of
                         * the chunk is zeroed.
                         */
                        ANON_INITBUF(ahp->array_chunk, oldarrsz,
                            level2, PAGESIZE, growdown);
                        kmem_free(ahp->array_chunk, oldarrsz);
                }
                /*
                 * The old data now lives in a single chunk: hang it off
                 * the last top-level entry when growing down, the first
                 * otherwise, and leave every other entry NULL.
                 */
                bzero(level1, newarrsz);
                if (growdown)
                        level1[nelems - 1] = level2;
                else
                        level1[0] = level2;
        } else {
                /*
                 * Same number of levels before and after: copy the old top
                 * level (slots, or chunk pointers for a two-level array)
                 * into the new one, tail aligned when growing down, and
                 * free the old top level.
                 */
                oldarrsz = oelems * sizeof (void *);

                ANON_INITBUF(ahp->array_chunk, oldarrsz,
                    level1, newarrsz, growdown);
                kmem_free(ahp->array_chunk, oldarrsz);
        }

        ahp->array_chunk = level1;
        ahp->size = newamp_pgs;
        /* the in-use region (old plus new pages) ends at the array's end */
        if (growdown)
                *startidx_p = newamp_pgs - totpages;

        return (newamp_pgs);
}
 732
 733
 734 /*
 735  * Called to sync ani_free value.
 736  */
 737
 738 void
 739 set_anoninfo(void)
 740 {
 741         processorid_t   ix, max_seqid;
 742         pgcnt_t         total = 0;
 743         static clock_t  last_time;
 744         clock_t         new_time;
 745
 746         if (ani_free_pool == NULL)
 747                 return;
 748
 749         /*
 750          * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
 751          * identify the maximum number of CPUs were ever online.
 752          */
 753         new_time = ddi_get_lbolt();
 754         if (new_time > last_time) {
 755
 756                 max_seqid = max_cpu_seqid_ever;
 757                 ASSERT(ANI_MAX_POOL > max_seqid);
 758                 for (ix = 0; ix <= max_seqid; ix++)
 759                         total += ani_free_pool[ix].ani_count;
 760
 761                 last_time = new_time;
 762                 k_anoninfo.ani_free = total;
 763         }
 764 }
 765
 766 /*
 767  * Reserve anon space.
 768  *
 769  * It's no longer simply a matter of incrementing ani_resv to
 770  * reserve swap space, we need to check memory-based as well
 771  * as disk-backed (physical) swap.  The following algorithm
 772  * is used:
 773  *      Check the space on physical swap
 774  *              i.e. amount needed < ani_max - ani_phys_resv
 775  *      If we are swapping on swapfs check
 776  *              amount needed < (availrmem - swapfs_minfree)
 777  * Since the algorithm to check for the quantity of swap space is
 778  * almost the same as that for reserving it, we'll just use anon_resvmem
 779  * with a flag to decrement availrmem.
 780  *
 781  * Return non-zero on success.
 782  */
 783 int
 784 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
 785 {
 786         pgcnt_t npages = btopr(size);
 787         pgcnt_t mswap_pages = 0;
 788         pgcnt_t pswap_pages = 0;
 789         proc_t *p = curproc;
 790
 791         if (zone != NULL) {
 792                 /* test zone.max-swap resource control */
 793                 mutex_enter(&p->p_lock);
 794                 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
 795                         mutex_exit(&p->p_lock);
 796
 797                         if (takemem)
 798                                 atomic_add_64(&zone->zone_anon_alloc_fail, 1);
 799
 800                         return (0);
 801                 }
 802
 803                 if (!takemem)
 804                         rctl_decr_swap(zone, ptob(npages));
 805
 806                 mutex_exit(&p->p_lock);
 807         }
 808         mutex_enter(&anoninfo_lock);
 809
 810         /*
 811          * pswap_pages is the number of pages we can take from
 812          * physical (i.e. disk-backed) swap.
 813          */
 814         ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 815         pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
 816
 817         ANON_PRINT(A_RESV,
 818             ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
 819             npages, takemem, pswap_pages, (void *)caller()));
 820
 821         if (npages <= pswap_pages) {
 822                 /*
 823                  * we have enough space on a physical swap
 824                  */
 825                 if (takemem)
 826                         k_anoninfo.ani_phys_resv += npages;
 827                 mutex_exit(&anoninfo_lock);
 828                 return (1);
 829         } else if (pswap_pages != 0) {
 830                 /*
 831                  * we have some space on a physical swap
 832                  */
 833                 if (takemem) {
 834                         /*
 835                          * use up remainder of phys swap
 836                          */
 837                         k_anoninfo.ani_phys_resv += pswap_pages;
 838                         ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
 839                 }
 840         }
 841         /*
 842          * since (npages > pswap_pages) we need mem swap
 843          * mswap_pages is the number of pages needed from availrmem
 844          */
 845         ASSERT(npages > pswap_pages);
 846         mswap_pages = npages - pswap_pages;
 847
 848         ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
 849             mswap_pages));
 850
 851         /*
 852          * priv processes can reserve memory as swap as long as availrmem
 853          * remains greater than swapfs_minfree; in the case of non-priv
 854          * processes, memory can be reserved as swap only if availrmem
 855          * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
 856          * swapfs_reserve amount of memswap is not available to non-priv
 857          * processes. This protects daemons such as automounter dying
 858          * as a result of application processes eating away almost entire
 859          * membased swap. This safeguard becomes useless if apps are run
 860          * with root access.
 861          *
 862          * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
 863          *
 864          */
 865         if (tryhard) {
 866                 pgcnt_t floor_pages;
 867
 868                 if (secpolicy_resource_anon_mem(CRED())) {
 869                         floor_pages = swapfs_minfree;
 870                 } else {
 871                         floor_pages = swapfs_minfree + swapfs_reserve;
 872                 }
 873
 874                 mutex_exit(&anoninfo_lock);
 875                 (void) page_reclaim_mem(mswap_pages, floor_pages, 0);
 876                 mutex_enter(&anoninfo_lock);
 877         }
 878
 879         mutex_enter(&freemem_lock);
 880         if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
 881             (availrmem > (swapfs_minfree + mswap_pages) &&
 882             secpolicy_resource(CRED()) == 0)) {
 883
 884                 if (takemem) {
 885                         /*
 886                          * Take the memory from the rest of the system.
 887                          */
 888                         availrmem -= mswap_pages;
 889                         mutex_exit(&freemem_lock);
 890                         k_anoninfo.ani_mem_resv += mswap_pages;
 891                         ANI_ADD(mswap_pages);
 892                         ANON_PRINT((A_RESV | A_MRESV),
 893                             ("anon_resvmem: took %ld pages of availrmem\n",
 894                             mswap_pages));
 895                 } else {
 896                         mutex_exit(&freemem_lock);
 897                 }
 898
 899                 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 900                 mutex_exit(&anoninfo_lock);
 901                 return (1);
 902         } else {
 903                 /*
 904                  * Fail if not enough memory
 905                  */
 906                 if (takemem) {
 907                         k_anoninfo.ani_phys_resv -= pswap_pages;
 908                 }
 909
 910                 mutex_exit(&freemem_lock);
 911                 mutex_exit(&anoninfo_lock);
 912                 ANON_PRINT(A_RESV,
 913                     ("anon_resvmem: not enough space from swapfs\n"));
 914                 if (zone != NULL && takemem)
 915                         rctl_decr_swap(zone, ptob(npages));
 916                 return (0);
 917         }
 918 }
 919
 920 /*
 921  * Give back an anon reservation.
 922  */
 923 void
 924 anon_unresvmem(size_t size, zone_t *zone)
 925 {
 926         pgcnt_t npages = btopr(size);
 927         spgcnt_t mem_free_pages = 0;
 928         pgcnt_t phys_free_slots;
 929 #ifdef  ANON_DEBUG
 930         pgcnt_t mem_resv;
 931 #endif
 932         if (zone != NULL)
 933                 rctl_decr_swap(zone, ptob(npages));
 934
 935         mutex_enter(&anoninfo_lock);
 936
 937         ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
 938
 939         /*
 940          * If some of this reservation belonged to swapfs
 941          * give it back to availrmem.
 942          * ani_mem_resv is the amount of availrmem swapfs has reserved.
 943          * but some of that memory could be locked by segspt so we can only
 944          * return non locked ani_mem_resv back to availrmem
 945          */
 946         if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
 947                 ANON_PRINT((A_RESV | A_MRESV),
 948                     ("anon_unresv: growing availrmem by %ld pages\n",
 949                     MIN(k_anoninfo.ani_mem_resv, npages)));
 950
 951                 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
 952                     k_anoninfo.ani_locked_swap), npages);
 953                 mutex_enter(&freemem_lock);
 954                 availrmem += mem_free_pages;
 955                 mutex_exit(&freemem_lock);
 956                 k_anoninfo.ani_mem_resv -= mem_free_pages;
 957
 958                 ANI_ADD(-mem_free_pages);
 959         }
 960         /*
 961          * The remainder of the pages is returned to phys swap
 962          */
 963         ASSERT(npages >= mem_free_pages);
 964         phys_free_slots = npages - mem_free_pages;
 965
 966         if (phys_free_slots) {
 967                 k_anoninfo.ani_phys_resv -= phys_free_slots;
 968         }
 969
 970 #ifdef  ANON_DEBUG
 971         mem_resv = k_anoninfo.ani_mem_resv;
 972 #endif
 973
 974         ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
 975         ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
 976
 977         mutex_exit(&anoninfo_lock);
 978
 979         ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
 980             npages, mem_resv, (void *)caller()));
 981 }
 982
 983 /*
 984  * Allocate an anon slot and return it with the lock held.
 985  */
 986 struct anon *
 987 anon_alloc(struct vnode *vp, anoff_t off)
 988 {
 989         struct anon     *ap;
 990         kmutex_t        *ahm;
 991
 992         ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
 993         if (vp == NULL) {
 994                 swap_alloc(ap);
 995         } else {
 996                 ap->an_vp = vp;
 997                 ap->an_off = off;
 998         }
 999         ap->an_refcnt = 1;
1000         ap->an_pvp = NULL;
1001         ap->an_poff = 0;
1002         ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1003         mutex_enter(ahm);
1004         anon_addhash(ap);
1005         mutex_exit(ahm);
1006         ANI_ADD(-1);
1007         ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
1008             (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1009         return (ap);
1010 }
1011
1012 /*
1013  * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1014  * such pages don't consume any physical swap resources needed for swapping
1015  * unlocked pages.
1016  */
1017 void
1018 anon_swap_free(struct anon *ap, page_t *pp)
1019 {
1020         kmutex_t *ahm;
1021
1022         ASSERT(ap != NULL);
1023         ASSERT(pp != NULL);
1024         ASSERT(PAGE_LOCKED(pp));
1025         VERIFY(pp->p_object != NULL);
1026         ASSERT(pp->p_vnode != NULL);
1027         ASSERT(IS_SWAPFSVP(pp->p_vnode));
1028         ASSERT(ap->an_refcnt != 0);
1029         VERIFY(pp->p_object == &ap->an_vp->v_object);
1030         ASSERT(pp->p_vnode == ap->an_vp);
1031         ASSERT(pp->p_offset == ap->an_off);
1032
1033         if (ap->an_pvp == NULL)
1034                 return;
1035
1036         page_io_lock(pp);
1037         ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1038         mutex_enter(ahm);
1039
1040         ASSERT(ap->an_refcnt != 0);
1041         VERIFY(pp->p_object == &ap->an_vp->v_object);
1042         ASSERT(pp->p_vnode == ap->an_vp);
1043         ASSERT(pp->p_offset == ap->an_off);
1044
1045         if (ap->an_pvp != NULL) {
1046                 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1047                 ap->an_pvp = NULL;
1048                 ap->an_poff = 0;
1049                 mutex_exit(ahm);
1050                 hat_setmod(pp);
1051         } else {
1052                 mutex_exit(ahm);
1053         }
1054         page_io_unlock(pp);
1055 }
1056
1057 /*
1058  * Decrement the reference count of an anon page.
1059  * If reference count goes to zero, free it and
1060  * its associated page (if any).
1061  */
1062 void
1063 anon_decref(struct anon *ap)
1064 {
1065         page_t *pp;
1066         struct vnode *vp;
1067         anoff_t off;
1068         kmutex_t *ahm;
1069
1070         ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1071         mutex_enter(ahm);
1072         ASSERT(ap->an_refcnt != 0);
1073         if (ap->an_refcnt == 0)
1074                 panic("anon_decref: slot count 0");
1075         if (--ap->an_refcnt == 0) {
1076                 swap_xlate(ap, &vp, &off);
1077                 anon_rmhash(ap);
1078                 if (ap->an_pvp != NULL)
1079                         swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1080                 mutex_exit(ahm);
1081
1082                 /*
1083                  * If there is a page for this anon slot we will need to
1084                  * call VN_DISPOSE to get rid of the vp association and
1085                  * put the page back on the free list as really free.
1086                  * Acquire the "exclusive" lock to ensure that any
1087                  * pending i/o always completes before the swap slot
1088                  * is freed.
1089                  */
1090                 pp = page_lookup(&vp->v_object, (uoff_t)off, SE_EXCL);
1091                 if (pp != NULL) {
1092                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
1093                 }
1094                 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1095                     (void *)ap, (void *)ap->an_vp));
1096
1097                 kmem_cache_free(anon_cache, ap);
1098
1099                 ANI_ADD(1);
1100         } else {
1101                 mutex_exit(ahm);
1102         }
1103 }
1104
1105
1106 /*
1107  * check an_refcnt of the root anon slot (anon_index argument is aligned at
1108  * seg->s_szc level) to determine whether COW processing is required.
1109  * anonpages_hash_lock[] held on the root ap ensures that if root's
1110  * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1111  * later since this process can't fork while its AS lock is held).
1112  *
1113  * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
1114  */
1115 int
1116 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1117 {
1118         struct anon     *ap;
1119         kmutex_t        *ahmpages = NULL;
1120
1121         ap = anon_get_ptr(ahp, anon_index);
1122         if (ap == NULL)
1123                 return (0);
1124
1125         ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1126         mutex_enter(ahmpages);
1127         ASSERT(ap->an_refcnt >= 1);
1128         if (ap->an_refcnt == 1) {
1129                 mutex_exit(ahmpages);
1130                 return (0);
1131         }
1132         mutex_exit(ahmpages);
1133         return (1);
1134 }
1135 /*
1136  * Check 'nslots' anon slots for refcnt > 1.
1137  *
1138  * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
1139  * returns 0.
1140  */
1141 static int
1142 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1143 {
1144         struct anon *ap;
1145
1146         while (nslots-- > 0) {
1147                 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1148                     ap->an_refcnt > 1)
1149                         return (1);
1150                 anon_index++;
1151         }
1152
1153         return (0);
1154 }
1155
/*
 * Drop one reference on every anon slot of the large page region that
 * starts at an_idx in ahp (page_get_pagecnt(szc) slots, clipped to the end
 * of the array), clearing each slot pointer in ahp on the way.
 *
 * The refcnt of the first (root) slot selects the mode:
 *   - root present with refcnt > 1: the root's APH_MUTEX is held for the
 *     whole walk and each slot's refcnt is just decremented;
 *   - root absent or refcnt == 1: each slot is unhashed, its physical swap
 *     slot released, its page(s) destroyed or disposed of, and the anon
 *     structure freed.
 */
static void
anon_decref_pages(
        struct anon_hdr *ahp,
        ulong_t an_idx,
        uint_t szc)
{
        struct anon *ap = anon_get_ptr(ahp, an_idx);
        kmutex_t *ahmpages = NULL;
        page_t *pp;
        pgcnt_t pgcnt = page_get_pagecnt(szc);
        pgcnt_t i;
        struct vnode *vp;
        anoff_t   off;
        kmutex_t *ahm;
#ifdef DEBUG
        /* refcnt of the root slot; only referenced from ASSERTs */
        int refcnt = 1;
#endif

        ASSERT(szc != 0);
        ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
        ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
        ASSERT(an_idx < ahp->size);

        if (ahp->size - an_idx < pgcnt) {
                /*
                 * In case of shared mappings total anon map size may not be
                 * the largest page size aligned.
                 */
                pgcnt = ahp->size - an_idx;
        }

        VM_STAT_ADD(anonvmstats.decrefpages[0]);

        if (ap != NULL) {
                ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
                mutex_enter(ahmpages);
                ASSERT((refcnt = ap->an_refcnt) != 0);
                VM_STAT_ADD(anonvmstats.decrefpages[1]);
                if (ap->an_refcnt == 1) {
                        /*
                         * Unshared region: drop the region mutex again.
                         * From here on ahmpages == NULL means "free the
                         * slots", non-NULL means "only decrement".
                         */
                        VM_STAT_ADD(anonvmstats.decrefpages[2]);
                        ASSERT(!anon_share(ahp, an_idx, pgcnt));
                        mutex_exit(ahmpages);
                        ahmpages = NULL;
                }
        }

        i = 0;
        while (i < pgcnt) {
                if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
                        /* holes are expected only in an unshared region */
                        ASSERT(refcnt == 1 && ahmpages == NULL);
                        i++;
                        continue;
                }
                ASSERT(ap->an_refcnt == refcnt);
                ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
                ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

                if (ahmpages == NULL) {
                        swap_xlate(ap, &vp, &off);
                        pp = page_lookup(&vp->v_object, (uoff_t)off, SE_EXCL);
                        if (pp == NULL || pp->p_szc == 0) {
                                /*
                                 * No page, or a small page: free this one
                                 * slot on its own.
                                 */
                                VM_STAT_ADD(anonvmstats.decrefpages[3]);
                                ahm = AH_MUTEX(ap->an_vp, ap->an_off);
                                (void) anon_set_ptr(ahp, an_idx + i, NULL,
                                    ANON_SLEEP);
                                mutex_enter(ahm);
                                ap->an_refcnt--;
                                ASSERT(ap->an_refcnt == 0);
                                anon_rmhash(ap);
                                if (ap->an_pvp)
                                        swap_phys_free(ap->an_pvp, ap->an_poff,
                                            PAGESIZE);
                                mutex_exit(ahm);
                                if (pp == NULL) {
                                        /*
                                         * NOTE(review): second lookup after
                                         * the slot has been unhashed --
                                         * presumably to catch a page that
                                         * appeared since the first lookup;
                                         * confirm.
                                         */
                                        pp = page_lookup(&vp->v_object,
                                                         (uoff_t)off, SE_EXCL);
                                        ASSERT(pp == NULL || pp->p_szc == 0);
                                }
                                if (pp != NULL) {
                                        VM_STAT_ADD(anonvmstats.decrefpages[4]);
                                        VN_DISPOSE(pp, B_INVAL, 0, kcred);
                                }
                                kmem_cache_free(anon_cache, ap);
                                ANI_ADD(1);
                                i++;
                        } else {
                                /*
                                 * pp is the first constituent of a large
                                 * page of curpgcnt pages.  Collect all
                                 * constituents, exclusively locked, in
                                 * ppa[], free the matching anon slots, then
                                 * get rid of the pages as a group.
                                 */
                                pgcnt_t j;
                                pgcnt_t curpgcnt =
                                    page_get_pagecnt(pp->p_szc);
                                size_t ppasize = curpgcnt * sizeof (page_t *);
                                page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
                                int dispose = 0;

                                VM_STAT_ADD(anonvmstats.decrefpages[5]);

                                ASSERT(pp->p_szc <= szc);
                                ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
                                ASSERT(IS_P2ALIGNED(i, curpgcnt));
                                ASSERT(i + curpgcnt <= pgcnt);
                                ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
                                ppa[0] = pp;
                                /*
                                 * Gather the remaining constituent pages.
                                 * NOTE(review): this loop starts at i + 1,
                                 * so the slot at index i is neither passed
                                 * to hat_pageunload() nor examined for the
                                 * dispose decision here.
                                 */
                                for (j = i + 1; j < i + curpgcnt; j++) {
                                        ap = anon_get_ptr(ahp, an_idx + j);
                                        ASSERT(ap != NULL &&
                                            ap->an_refcnt == 1);
                                        swap_xlate(ap, &vp, &off);
                                        pp = page_lookup(&vp->v_object,
                                                         (uoff_t)off, SE_EXCL);
                                        if (pp == NULL)
                                                panic("anon_decref_pages: "
                                                    "no page");

                                        (void) hat_pageunload(pp,
                                            HAT_FORCE_PGUNLOAD);
                                        ASSERT(pp->p_szc == ppa[0]->p_szc);
                                        ASSERT(page_pptonum(pp) - 1 ==
                                            page_pptonum(ppa[j - i - 1]));
                                        ppa[j - i] = pp;
                                        /*
                                         * A backing vnode with its own
                                         * dispose op (neither fs_dispose
                                         * nor NULL) forces the per-page
                                         * VN_DISPOSE path below.
                                         */
                                        if (ap->an_pvp != NULL &&
                                            (ap->an_pvp->v_op->vop_dispose != fs_dispose &&
                                             ap->an_pvp->v_op->vop_dispose != NULL))
                                                dispose = 1;
                                }
                                /*
                                 * Unhash and free every anon slot that
                                 * belongs to this large page.
                                 */
                                for (j = i; j < i + curpgcnt; j++) {
                                        ap = anon_get_ptr(ahp, an_idx + j);
                                        ASSERT(ap != NULL &&
                                            ap->an_refcnt == 1);
                                        ahm = AH_MUTEX(ap->an_vp, ap->an_off);
                                        (void) anon_set_ptr(ahp, an_idx + j,
                                            NULL, ANON_SLEEP);
                                        mutex_enter(ahm);
                                        ap->an_refcnt--;
                                        ASSERT(ap->an_refcnt == 0);
                                        anon_rmhash(ap);
                                        if (ap->an_pvp)
                                                swap_phys_free(ap->an_pvp,
                                                    ap->an_poff, PAGESIZE);
                                        mutex_exit(ahm);
                                        kmem_cache_free(anon_cache, ap);
                                        ANI_ADD(1);
                                }
                                if (!dispose) {
                                        VM_STAT_ADD(anonvmstats.decrefpages[6]);
                                        page_destroy_pages(ppa[0]);
                                } else {
                                        /*
                                         * Reset every constituent to size
                                         * code 0 first, then dispose of the
                                         * pages one at a time.
                                         */
                                        VM_STAT_ADD(anonvmstats.decrefpages[7]);
                                        for (j = 0; j < curpgcnt; j++) {
                                                ASSERT(PAGE_EXCL(ppa[j]));
                                                ppa[j]->p_szc = 0;
                                        }
                                        for (j = 0; j < curpgcnt; j++) {
                                                ASSERT(!hat_page_is_mapped(
                                                    ppa[j]));
                                                VN_DISPOSE(ppa[j], B_INVAL, 0,
                                                    kcred);
                                        }
                                }
                                kmem_free(ppa, ppasize);
                                i += curpgcnt;
                        }
                } else {
                        /*
                         * Shared region: clear the slot pointer in ahp and
                         * drop this reference; the anon structure itself is
                         * not freed here.
                         */
                        VM_STAT_ADD(anonvmstats.decrefpages[8]);
                        (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
                        ahm = AH_MUTEX(ap->an_vp, ap->an_off);
                        mutex_enter(ahm);
                        ap->an_refcnt--;
                        mutex_exit(ahm);
                        i++;
                }
        }

        if (ahmpages != NULL) {
                mutex_exit(ahmpages);
        }
}
1331
1332 /*
1333  * Duplicate references to size bytes worth of anon pages.
1334  * Used when duplicating a segment that contains private anon pages.
1335  * This code assumes that procedure calling this one has already used
1336  * hat_chgprot() to disable write access to the range of addresses that
1337  * that *old actually refers to.
1338  */
1339 void
1340 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1341                         ulong_t new_idx, size_t size)
1342 {
1343         spgcnt_t npages;
1344         kmutex_t *ahm;
1345         struct anon *ap;
1346         ulong_t off;
1347         ulong_t index;
1348
1349         npages = btopr(size);
1350         while (npages > 0) {
1351                 index = old_idx;
1352                 if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1353                         break;
1354
1355                 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1356                 off = index - old_idx;
1357                 npages -= off;
1358                 if (npages <= 0)
1359                         break;
1360
1361                 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1362                 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1363
1364                 mutex_enter(ahm);
1365                 ap->an_refcnt++;
1366                 mutex_exit(ahm);
1367
1368                 off++;
1369                 new_idx += off;
1370                 old_idx += off;
1371                 npages--;
1372         }
1373 }
1374
1375 /*
1376  * Just like anon_dup but also guarantees there are no holes (unallocated anon
1377  * slots) within any large page region. That means if a large page region is
1378  * empty in the old array it will skip it. If there are 1 or more valid slots
1379  * in the large page region of the old array it will make sure to fill in any
1380  * unallocated ones and also copy them to the new array. If noalloc is 1 large
1381  * page region should either have no valid anon slots or all slots should be
1382  * valid.
1383  */
1384 void
1385 anon_dup_fill_holes(
1386         struct anon_hdr *old,
1387         ulong_t old_idx,
1388         struct anon_hdr *new,
1389         ulong_t new_idx,
1390         size_t size,
1391         uint_t szc,
1392         int noalloc)
1393 {
1394         struct anon     *ap;
1395         spgcnt_t        npages;
1396         kmutex_t        *ahm, *ahmpages = NULL;
1397         pgcnt_t         pgcnt, i;
1398         ulong_t         index, off;
1399 #ifdef DEBUG
1400         int             refcnt;
1401 #endif
1402
1403         ASSERT(szc != 0);
1404         pgcnt = page_get_pagecnt(szc);
1405         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1406         npages = btopr(size);
1407         ASSERT(IS_P2ALIGNED(npages, pgcnt));
1408         ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1409
1410         VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1411
1412         while (npages > 0) {
1413                 index = old_idx;
1414
1415                 /*
1416                  * Find the next valid slot.
1417                  */
1418                 if (anon_get_next_ptr(old, &index) == NULL)
1419                         break;
1420
1421                 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1422                 /*
1423                  * Now backup index to the beginning of the
1424                  * current large page region of the old array.
1425                  */
1426                 index = P2ALIGN(index, pgcnt);
1427                 off = index - old_idx;
1428                 ASSERT(IS_P2ALIGNED(off, pgcnt));
1429                 npages -= off;
1430                 if (npages <= 0)
1431                         break;
1432
1433                 /*
1434                  * Fill and copy a large page regions worth
1435                  * of anon slots.
1436                  */
1437                 for (i = 0; i < pgcnt; i++) {
1438                         if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1439                                 if (noalloc) {
1440                                         panic("anon_dup_fill_holes: "
1441                                             "empty anon slot\n");
1442                                 }
1443                                 VM_STAT_ADD(anonvmstats.dupfillholes[1]);
1444                                 ap = anon_alloc(NULL, 0);
1445                                 (void) anon_set_ptr(old, index + i, ap,
1446                                     ANON_SLEEP);
1447                         } else if (i == 0) {
1448                                 /*
1449                                  * make the increment of all refcnts of all
1450                                  * anon slots of a large page appear atomic by
1451                                  * getting an anonpages_hash_lock for the
1452                                  * first anon slot of a large page.
1453                                  */
1454                                 VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1455
1456                                 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1457                                 mutex_enter(ahmpages);
1458
1459                                 ASSERT(refcnt = ap->an_refcnt);
1460
1461                                 VM_STAT_COND_ADD(ap->an_refcnt > 1,
1462                                     anonvmstats.dupfillholes[3]);
1463                         }
1464                         (void) anon_set_ptr(new, new_idx + off + i, ap,
1465                             ANON_SLEEP);
1466                         ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1467                         mutex_enter(ahm);
1468                         ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1469                         ASSERT(i == 0 || ahmpages == NULL ||
1470                             refcnt == ap->an_refcnt);
1471                         ap->an_refcnt++;
1472                         mutex_exit(ahm);
1473                 }
1474                 if (ahmpages != NULL) {
1475                         mutex_exit(ahmpages);
1476                         ahmpages = NULL;
1477                 }
1478                 off += pgcnt;
1479                 new_idx += off;
1480                 old_idx += off;
1481                 npages -= pgcnt;
1482         }
1483 }
1484
1485 /*
1486  * Used when a segment with a vnode changes szc. similarly to
1487  * anon_dup_fill_holes() makes sure each large page region either has no anon
1488  * slots or all of them. but new slots are created by COWing the file
1489  * pages. on entrance no anon slots should be shared.
1490  */
1491 int
1492 anon_fill_cow_holes(
1493         struct seg *seg,
1494         caddr_t addr,
1495         struct anon_hdr *ahp,
1496         ulong_t an_idx,
1497         struct vnode *vp,
1498         uoff_t vp_off,
1499         size_t size,
1500         uint_t szc,
1501         uint_t prot,
1502         struct vpage vpage[],
1503         struct cred *cred)
1504 {
1505         struct anon     *ap;
1506         spgcnt_t        npages;
1507         pgcnt_t         pgcnt, i;
1508         ulong_t         index, off;
1509         int             err = 0;
1510         int             pageflags = 0;
1511
1512         ASSERT(szc != 0);
1513         pgcnt = page_get_pagecnt(szc);
1514         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1515         npages = btopr(size);
1516         ASSERT(IS_P2ALIGNED(npages, pgcnt));
1517         ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1518
1519         while (npages > 0) {
1520                 index = an_idx;
1521
1522                 /*
1523                  * Find the next valid slot.
1524                  */
1525                 if (anon_get_next_ptr(ahp, &index) == NULL) {
1526                         break;
1527                 }
1528
1529                 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1530                 /*
1531                  * Now backup index to the beginning of the
1532                  * current large page region of the anon array.
1533                  */
1534                 index = P2ALIGN(index, pgcnt);
1535                 off = index - an_idx;
1536                 ASSERT(IS_P2ALIGNED(off, pgcnt));
1537                 npages -= off;
1538                 if (npages <= 0)
1539                         break;
1540                 an_idx += off;
1541                 vp_off += ptob(off);
1542                 addr += ptob(off);
1543                 if (vpage != NULL) {
1544                         vpage += off;
1545                 }
1546
1547                 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1548                         if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1549                                 page_t *pl[1 + 1];
1550                                 page_t *pp;
1551
1552                                 err = fop_getpage(vp, vp_off, PAGESIZE, NULL,
1553                                     pl, PAGESIZE, seg, addr, S_READ, cred,
1554                                     NULL);
1555                                 if (err) {
1556                                         break;
1557                                 }
1558                                 if (vpage != NULL) {
1559                                         prot = VPP_PROT(vpage);
1560                                         pageflags = VPP_ISPPLOCK(vpage) ?
1561                                             LOCK_PAGE : 0;
1562                                 }
1563                                 pp = anon_private(&ap, seg, addr, prot, pl[0],
1564                                     pageflags, cred);
1565                                 if (pp == NULL) {
1566                                         err = ENOMEM;
1567                                         break;
1568                                 }
1569                                 (void) anon_set_ptr(ahp, an_idx, ap,
1570                                     ANON_SLEEP);
1571                                 page_unlock(pp);
1572                         }
1573                         ASSERT(ap->an_refcnt == 1);
1574                         addr += PAGESIZE;
1575                         if (vpage != NULL) {
1576                                 vpage++;
1577                         }
1578                 }
1579                 npages -= pgcnt;
1580         }
1581
1582         return (err);
1583 }
1584
1585 /*
1586  * Free a group of "size" anon pages, size in bytes,
1587  * and clear out the pointers to the anon entries.
1588  */
1589 void
1590 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1591 {
1592         spgcnt_t npages;
1593         struct anon *ap;
1594         ulong_t old;
1595
1596         npages = btopr(size);
1597
1598         while (npages > 0) {
1599                 old = index;
1600                 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1601                         break;
1602
1603                 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1604                 npages -= index - old;
1605                 if (npages <= 0)
1606                         break;
1607
1608                 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1609                 anon_decref(ap);
1610                 /*
1611                  * Bump index and decrement page count
1612                  */
1613                 index++;
1614                 npages--;
1615         }
1616 }
1617
1618 void
1619 anon_free_pages(
1620         struct anon_hdr *ahp,
1621         ulong_t an_idx,
1622         size_t size,
1623         uint_t szc)
1624 {
1625         spgcnt_t        npages;
1626         pgcnt_t         pgcnt;
1627         ulong_t         index, off;
1628
1629         ASSERT(szc != 0);
1630         pgcnt = page_get_pagecnt(szc);
1631         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1632         npages = btopr(size);
1633         ASSERT(IS_P2ALIGNED(npages, pgcnt));
1634         ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1635         ASSERT(an_idx < ahp->size);
1636
1637         VM_STAT_ADD(anonvmstats.freepages[0]);
1638
1639         while (npages > 0) {
1640                 index = an_idx;
1641
1642                 /*
1643                  * Find the next valid slot.
1644                  */
1645                 if (anon_get_next_ptr(ahp, &index) == NULL)
1646                         break;
1647
1648                 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1649                 /*
1650                  * Now backup index to the beginning of the
1651                  * current large page region of the old array.
1652                  */
1653                 index = P2ALIGN(index, pgcnt);
1654                 off = index - an_idx;
1655                 ASSERT(IS_P2ALIGNED(off, pgcnt));
1656                 npages -= off;
1657                 if (npages <= 0)
1658                         break;
1659
1660                 anon_decref_pages(ahp, index, szc);
1661
1662                 off += pgcnt;
1663                 an_idx += off;
1664                 npages -= pgcnt;
1665         }
1666 }
1667
1668 /*
1669  * Make anonymous pages discardable
      *
      * Walks the anon slots covering "size" bytes of amp's anon array,
      * starting at slot "index".  For each valid slot whose page can be
      * locked SE_EXCL without waiting, the slot's physical swap (an_pvp /
      * an_poff) is released with swap_phys_free() and the page is handed to
      * VN_DISPOSE() (base pages) or page_destroy_pages() (large pages).
      *
      * behav must be MADV_FREE or MADV_PURGE (enforced by VERIFY).  With
      * MADV_PURGE a large page is first offered to page_try_demote_pages(),
      * and for every base page handled the anon structure is also removed
      * from the array and freed; the number of anon structures removed that
      * way is stored through "purged" when it is not NULL.
      *
      * The caller must hold amp->a_rwlock as reader (ASSERTed).
      *
      * Returns 0, or EBUSY if at least one slot was skipped because the page
      * looked up for it had a nonzero p_lckcnt or p_cowcnt, or because a
      * large page that is not aligned with / does not fit in the remaining
      * range could not be demoted.  Other skipped slots (page lookup miss,
      * anon refcnt > 1, failure to lock the remaining constituent pages)
      * are only counted in segadvstat.MADV_FREE_miss.
1670  */
1671 int
1672 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size,
1673     uint_t behav, pgcnt_t *purged)
1674 {
1675         spgcnt_t npages = btopr(size);
1676         struct anon *ap;
1677         struct vnode *vp;
1678         anoff_t off;
1679         page_t *pp, *root_pp;
1680         kmutex_t *ahm;
1681         pgcnt_t pgcnt, npurged = 0;
1682         ulong_t old_idx, idx, i;
1683         struct anon_hdr *ahp = amp->ahp;
1684         anon_sync_obj_t cookie;
1685         int err = 0;
1686
1687         VERIFY(behav == MADV_FREE || behav == MADV_PURGE);
1688         ASSERT(RW_READ_HELD(&amp->a_rwlock));
             /*
              * pgcnt drives the loop's step expression: when it is 1 the
              * index advances by a single slot, otherwise the index is rounded
              * up to the next pgcnt boundary.  npages is reduced by pgcnt in
              * either case.  Every "continue" below therefore relies on pgcnt
              * holding the size of what was just examined.
              */
1689         pgcnt = 1;
1690         for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1691             P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1692
1693                 /*
1694                  * get anon pointer and index for the first valid entry
1695                  * in the anon list, starting from "index"
1696                  */
1697                 old_idx = index;
1698                 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1699                         break;
1700
1701                 /*
1702                  * decrement npages by number of NULL anon slots we skipped
1703                  */
1704                 npages -= index - old_idx;
1705                 if (npages <= 0)
1706                         break;
1707
                     /*
                      * The slot is fetched again after anon_array_enter();
                      * from here on every exit from this iteration must call
                      * anon_array_exit(&cookie).
                      */
1708                 anon_array_enter(amp, index, &cookie);
1709                 ap = anon_get_ptr(ahp, index);
1710                 ASSERT(ap != NULL);
1711
1712                 /*
1713                  * Get anonymous page and try to lock it SE_EXCL;
1714                  * if we couldn't grab the lock we skip to next page.
1715                  */
1716                 swap_xlate(ap, &vp, &off);
1717                 pp = page_lookup_nowait(&vp->v_object, (uoff_t)off, SE_EXCL);
1718                 if (pp == NULL) {
1719                         segadvstat.MADV_FREE_miss.value.ul++;
                             /*
                              * No page in hand, so no page size is known:
                              * step forward by a single slot.
                              */
1720                         pgcnt = 1;
1721                         anon_array_exit(&cookie);
1722                         continue;
1723                 }
1724                 pgcnt = page_get_pagecnt(pp->p_szc);
1725
1726                 /*
1727                  * we cannot free a page which is permanently locked.
1728                  * The page_struct_lock need not be acquired to examine
1729                  * these fields since the page has an "exclusive" lock.
1730                  */
1731                 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1732                         page_unlock(pp);
1733                         segadvstat.MADV_FREE_miss.value.ul++;
1734                         anon_array_exit(&cookie);
1735                         err = EBUSY;
1736                         continue;
1737                 }
1738
                     /*
                      * ahm is held from here until each path below has
                      * finished reading or updating ap's refcnt and swap
                      * fields.
                      */
1739                 ahm = AH_MUTEX(vp, off);
1740                 mutex_enter(ahm);
1741                 ASSERT(ap->an_refcnt != 0);
1742                 /*
1743                  * skip this one if copy-on-write is not yet broken.
1744                  */
1745                 if (ap->an_refcnt > 1) {
1746                         mutex_exit(ahm);
1747                         page_unlock(pp);
1748                         segadvstat.MADV_FREE_miss.value.ul++;
1749                         anon_array_exit(&cookie);
1750                         continue;
1751                 }
1752
1753                 if (behav == MADV_PURGE && pp->p_szc != 0) {
1754                         /*
1755                          * If we're purging and we have a large page, simplify
1756                          * things a bit by demoting ourselves into the base
1757                          * page case.
1758                          */
                             /*
                              * The result is deliberately ignored: the p_szc
                              * test that follows decides which path is taken.
                              */
1759                         (void) page_try_demote_pages(pp);
1760                 }
1761
1762                 if (pp->p_szc == 0) {
1763                         pgcnt = 1;
1764
1765                         /*
1766                          * free swap slot;
1767                          */
1768                         if (ap->an_pvp) {
1769                                 swap_phys_free(ap->an_pvp, ap->an_poff,
1770                                     PAGESIZE);
1771                                 ap->an_pvp = NULL;
1772                                 ap->an_poff = 0;
1773                         }
1774
1775                         if (behav == MADV_PURGE) {
1776                                 /*
1777                                  * If we're purging (instead of merely freeing),
1778                                  * rip out this anon structure entirely to
1779                                  * assure that any subsequent fault pulls from
1780                                  * the backing vnode (if any).
1781                                  */
1782                                 if (--ap->an_refcnt == 0)
1783                                         anon_rmhash(ap);
1784
1785                                 mutex_exit(ahm);
1786                                 (void) anon_set_ptr(ahp, index,
1787                                     NULL, ANON_SLEEP);
1788                                 npurged++;
1789                                 ANI_ADD(1);
1790                                 kmem_cache_free(anon_cache, ap);
1791                         } else {
1792                                 mutex_exit(ahm);
1793                         }
1794
1795                         segadvstat.MADV_FREE_hit.value.ul++;
1796
1797                         /*
1798                          * while we are at it, unload all the translations
1799                          * and attempt to free the page.
1800                          */
1801                         (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1802
                             /*
                              * MADV_FREE disposes with B_FREE, MADV_PURGE with
                              * B_INVAL.
                              */
1803                         VN_DISPOSE(pp,
1804                             behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred);
1805
1806                         anon_array_exit(&cookie);
1807                         continue;
1808                 }
1809
                     /*
                      * Large page.  If the slot is not at the start of the
                      * large page region, or the region extends past the end
                      * of the requested range, the page has to be demoted
                      * before this one slot can be handled on its own.
                      */
1810                 pgcnt = page_get_pagecnt(pp->p_szc);
1811                 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1812                         if (!page_try_demote_pages(pp)) {
1813                                 mutex_exit(ahm);
1814                                 page_unlock(pp);
1815                                 segadvstat.MADV_FREE_miss.value.ul++;
1816                                 anon_array_exit(&cookie);
1817                                 err = EBUSY;
1818                                 continue;
1819                         } else {
1820                                 pgcnt = 1;
1821                                 if (ap->an_pvp) {
1822                                         swap_phys_free(ap->an_pvp,
1823                                             ap->an_poff, PAGESIZE);
1824                                         ap->an_pvp = NULL;
1825                                         ap->an_poff = 0;
1826                                 }
1827                                 mutex_exit(ahm);
1828                                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1829
1830                                 VN_DISPOSE(pp, B_FREE, 0, kcred);
1831                                 segadvstat.MADV_FREE_hit.value.ul++;
1832                                 anon_array_exit(&cookie);
1833                                 continue;
1834                         }
1835                 }
1836                 mutex_exit(ahm);
1837                 root_pp = pp;
1838
1839                 /*
1840                  * try to lock remaining pages
1841                  */
                     /*
                      * On exit idx is the number of constituent pages that
                      * are locked (the root page plus those trylocked here);
                      * idx == pgcnt means the whole large page is locked.
                      * NOTE(review): pp++ assumes the page_t structures of a
                      * large page's constituents are adjacent in memory --
                      * confirm for this platform.
                      */
1842                 for (idx = 1; idx < pgcnt; idx++) {
1843                         pp++;
1844                         if (!page_trylock(pp, SE_EXCL))
1845                                 break;
1846                         if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1847                                 page_unlock(pp);
1848                                 break;
1849                         }
1850                 }
1851
1852                 if (idx == pgcnt) {
                             /*
                              * Release the physical swap of every slot in
                              * the region.  An empty slot ends this pass
                              * early, but the large page is still destroyed
                              * below; a shared slot (refcnt > 1) abandons the
                              * whole region via skiplp.
                              */
1853                         for (i = 0; i < pgcnt; i++) {
1854                                 ap = anon_get_ptr(ahp, index + i);
1855                                 if (ap == NULL)
1856                                         break;
1857                                 swap_xlate(ap, &vp, &off);
1858                                 ahm = AH_MUTEX(vp, off);
1859                                 mutex_enter(ahm);
1860                                 ASSERT(ap->an_refcnt != 0);
1861
1862                                 /*
1863                                  * skip this one if copy-on-write
1864                                  * is not yet broken.
1865                                  */
1866                                 if (ap->an_refcnt > 1) {
1867                                         mutex_exit(ahm);
1868                                         goto skiplp;
1869                                 }
1870                                 if (ap->an_pvp) {
1871                                         swap_phys_free(ap->an_pvp,
1872                                             ap->an_poff, PAGESIZE);
1873                                         ap->an_pvp = NULL;
1874                                         ap->an_poff = 0;
1875                                 }
1876                                 mutex_exit(ahm);
1877                         }
1878                         page_destroy_pages(root_pp);
1879                         segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1880                         anon_array_exit(&cookie);
1881                         continue;
1882                 }
1883 skiplp:
                     /*
                      * The large page could not be discarded.  Unlock exactly
                      * the idx pages that are currently locked, starting at
                      * the root page, and count the whole region as missed.
                      */
1884                 segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1885                 for (i = 0, pp = root_pp; i < idx; pp++, i++)
1886                         page_unlock(pp);
1887                 anon_array_exit(&cookie);
1888         }
1889
1890         if (purged != NULL)
1891                 *purged = npurged;
1892
1893         return (err);
1894 }
1895
1896 /*
1897  * Return the kept page(s) and protections back to the segment driver.
1898  */
1899 int
1900 anon_getpage(
1901         struct anon **app,
1902         uint_t *protp,
1903         page_t *pl[],
1904         size_t plsz,
1905         struct seg *seg,
1906         caddr_t addr,
1907         enum seg_rw rw,
1908         struct cred *cred)
1909 {
1910         page_t *pp;
1911         struct anon *ap = *app;
1912         struct vnode *vp;
1913         anoff_t off;
1914         int err;
1915         kmutex_t *ahm;
1916
1917         swap_xlate(ap, &vp, &off);
1918
1919         /*
1920          * Lookup the page. If page is being paged in,
1921          * wait for it to finish as we must return a list of
1922          * pages since this routine acts like the fop_getpage
1923          * routine does.
1924          */
1925         if (pl != NULL && (pp = page_lookup(&vp->v_object, (uoff_t)off, SE_SHARED))) {
1926                 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1927                 mutex_enter(ahm);
1928                 if (ap->an_refcnt == 1)
1929                         *protp = PROT_ALL;
1930                 else
1931                         *protp = PROT_ALL & ~PROT_WRITE;
1932                 mutex_exit(ahm);
1933                 pl[0] = pp;
1934                 pl[1] = NULL;
1935                 return (0);
1936         }
1937
1938         /*
1939          * Simply treat it as a vnode fault on the anon vp.
1940          */
1941
1942         err = fop_getpage(vp, (uoff_t)off, PAGESIZE, protp, pl, plsz,
1943             seg, addr, rw, cred, NULL);
1944
1945         if (err == 0 && pl != NULL) {
1946                 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1947                 mutex_enter(ahm);
1948                 if (ap->an_refcnt != 1)
1949                         *protp &= ~PROT_WRITE;  /* make read-only */
1950                 mutex_exit(ahm);
1951         }
1952         return (err);
1953 }
1954
1955 /*
1956  * Creates or returns kept pages to the segment driver.  returns -1 if a large
1957  * page cannot be allocated. returns -2 if some other process has allocated a
1958  * larger page.
1959  *
1960  * For cowfault it will allocate any size pages to fill the requested area to
1961  * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1962  * slots within a large page with other processes). This policy greatly
1963  * simplifies large page freeing (which is only freed when all anon slot
1964  * refcnts are 0).
1965  */
1966 int
1967 anon_map_getpages(
1968         struct anon_map *amp,
1969         ulong_t start_idx,
1970         uint_t  szc,
1971         struct seg *seg,
1972         caddr_t addr,
1973         uint_t prot,
1974         uint_t *protp,
1975         page_t  *ppa[],
1976         uint_t  *ppa_szc,
1977         struct vpage vpage[],
1978         enum seg_rw rw,
1979         int brkcow,
1980         int anypgsz,
1981         int pgflags,
1982         struct cred *cred)
1983 {
1984         pgcnt_t         pgcnt;
1985         struct anon     *ap;
1986         struct vnode    *vp;
1987         anoff_t         off;
1988         page_t          *pp, *pl[2], *conpp = NULL;
1989         caddr_t         vaddr;
1990         ulong_t         pg_idx, an_idx, i;
1991         spgcnt_t        nreloc = 0;
1992         int             prealloc = 1;
1993         int             err, slotcreate;
1994         uint_t          vpprot;
1995         int             upsize = (szc < seg->s_szc);
1996
1997 #if !defined(__i386) && !defined(__amd64)
1998         ASSERT(seg->s_szc != 0);
1999 #endif
2000         ASSERT(szc <= seg->s_szc);
2001         ASSERT(ppa_szc != NULL);
2002         ASSERT(rw != S_CREATE);
2003
2004         *protp = PROT_ALL;
2005
2006         VM_STAT_ADD(anonvmstats.getpages[0]);
2007
2008         if (szc == 0) {
2009                 VM_STAT_ADD(anonvmstats.getpages[1]);
2010                 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
2011                         err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
2012                             addr, rw, cred);
2013                         if (err)
2014                                 return (err);
2015                         ppa[0] = pl[0];
2016                         if (brkcow == 0 || (*protp & PROT_WRITE)) {
2017                                 VM_STAT_ADD(anonvmstats.getpages[2]);
2018                                 if (ppa[0]->p_szc != 0 && upsize) {
2019                                         VM_STAT_ADD(anonvmstats.getpages[3]);
2020                                         *ppa_szc = MIN(ppa[0]->p_szc,
2021                                             seg->s_szc);
2022                                         page_unlock(ppa[0]);
2023                                         return (-2);
2024                                 }
2025                                 return (0);
2026                         }
2027                         panic("anon_map_getpages: cowfault for szc 0");
2028                 } else {
2029                         VM_STAT_ADD(anonvmstats.getpages[4]);
2030                         ppa[0] = anon_zero(seg, addr, &ap, cred);
2031                         if (ppa[0] == NULL)
2032                                 return (ENOMEM);
2033                         (void) anon_set_ptr(amp->ahp, start_idx, ap,
2034                             ANON_SLEEP);
2035                         return (0);
2036                 }
2037         }
2038
2039         pgcnt = page_get_pagecnt(szc);
2040         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2041         ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2042
2043         /*
2044          * First we check for the case that the requested large
2045          * page or larger page already exists in the system.
2046          * Actually we only check if the first constituent page
2047          * exists and only preallocate if it's not found.
2048          */
2049         ap = anon_get_ptr(amp->ahp, start_idx);
2050         if (ap) {
2051                 uint_t pszc;
2052                 swap_xlate(ap, &vp, &off);
2053                 if (page_exists_forreal(&vp->v_object, (uoff_t)off, &pszc)) {
2054                         if (pszc > szc && upsize) {
2055                                 *ppa_szc = MIN(pszc, seg->s_szc);
2056                                 return (-2);
2057                         }
2058                         if (pszc >= szc) {
2059                                 prealloc = 0;
2060                         }
2061                 }
2062         }
2063
2064         VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2065         VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2066
2067 top:
2068         /*
2069          * If a smaller page or no page at all was found,
2070          * grab a large page off the freelist.
2071          */
2072         if (prealloc) {
2073                 ASSERT(conpp == NULL);
2074                 if (page_alloc_pages(&anon_vp->v_object, seg, addr, NULL, ppa,
2075                     szc, 0, pgflags) != 0) {
2076                         VM_STAT_ADD(anonvmstats.getpages[7]);
2077                         if (brkcow == 0 || szc < seg->s_szc ||
2078                             !anon_szcshare(amp->ahp, start_idx)) {
2079                                 /*
2080                                  * If the refcnt's of all anon slots are <= 1
2081                                  * they can't increase since we are holding
2082                                  * the address space's lock. So segvn can
2083                                  * safely decrease szc without risking to
2084                                  * generate a cow fault for the region smaller
2085                                  * than the segment's largest page size.
2086                                  */
2087                                 VM_STAT_ADD(anonvmstats.getpages[8]);
2088                                 return (-1);
2089                         }
2090                 docow:
2091                         /*
2092                          * This is a cow fault. Copy away the entire 1 large
2093                          * page region of this segment.
2094                          */
2095                         if (szc != seg->s_szc)
2096                                 panic("anon_map_getpages: cowfault for szc %d",
2097                                     szc);
2098                         vaddr = addr;
2099                         for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2100                             pg_idx++, an_idx++, vaddr += PAGESIZE) {
2101                                 if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2102                                     NULL) {
2103                                         err = anon_getpage(&ap, &vpprot, pl,
2104                                             PAGESIZE, seg, vaddr, rw, cred);
2105                                         if (err) {
2106                                                 for (i = 0; i < pg_idx; i++) {
2107                                                         if ((pp = ppa[i]) !=
2108                                                             NULL)
2109                                                                 page_unlock(pp);
2110                                                 }
2111                                                 return (err);
2112                                         }
2113                                         ppa[pg_idx] = pl[0];
2114                                 } else {
2115                                         /*
2116                                          * Since this is a cowfault we know
2117                                          * that this address space has a
2118                                          * parent or children which means
2119                                          * anon_dup_fill_holes() has initialized
2120                                          * all anon slots within a large page
2121                                          * region that had at least one anon
2122                                          * slot at the time of fork().
2123                                          */
2124                                         panic("anon_map_getpages: "
2125                                             "cowfault but anon slot is empty");
2126                                 }
2127                         }
2128                         VM_STAT_ADD(anonvmstats.getpages[9]);
2129                         *protp = PROT_ALL;
2130                         return (anon_map_privatepages(amp, start_idx, szc, seg,
2131                             addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2132                 }
2133         }
2134
2135         VM_STAT_ADD(anonvmstats.getpages[10]);
2136
2137         an_idx = start_idx;
2138         pg_idx = 0;
2139         vaddr = addr;
2140         while (pg_idx < pgcnt) {
2141                 slotcreate = 0;
2142                 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2143                         VM_STAT_ADD(anonvmstats.getpages[11]);
2144                         /*
2145                          * For us to have decided not to preallocate
2146                          * would have meant that a large page
2147                          * was found. Which also means that all of the
2148                          * anon slots for that page would have been
2149                          * already created for us.
2150                          */
2151                         if (prealloc == 0)
2152                                 panic("anon_map_getpages: prealloc = 0");
2153
2154                         slotcreate = 1;
2155                         ap = anon_alloc(NULL, 0);
2156                 }
2157                 swap_xlate(ap, &vp, &off);
2158
2159                 /*
2160                  * Now setup our preallocated page to pass down
2161                  * to swap_getpage().
2162                  */
2163                 if (prealloc) {
2164                         ASSERT(ppa[pg_idx]->p_szc == szc);
2165                         conpp = ppa[pg_idx];
2166                 }
2167                 ASSERT(prealloc || conpp == NULL);
2168
2169                 /*
2170                  * If we just created this anon slot then call
2171                  * with S_CREATE to prevent doing IO on the page.
2172                  * Similar to the anon_zero case.
2173                  */
2174                 err = swap_getconpage(vp, (uoff_t)off, PAGESIZE,
2175                     NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2176                     slotcreate == 1 ? S_CREATE : rw, cred);
2177
2178                 if (err) {
2179                         ASSERT(err != -2 || upsize);
2180                         VM_STAT_ADD(anonvmstats.getpages[12]);
2181                         ASSERT(slotcreate == 0);
2182                         goto io_err;
2183                 }
2184
2185                 pp = pl[0];
2186
2187                 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2188                         VM_STAT_ADD(anonvmstats.getpages[13]);
2189                         ASSERT(slotcreate == 0);
2190                         ASSERT(prealloc == 0);
2191                         ASSERT(pg_idx == 0);
2192                         if (pp->p_szc > szc) {
2193                                 ASSERT(upsize);
2194                                 *ppa_szc = MIN(pp->p_szc, seg->s_szc);
2195                                 page_unlock(pp);
2196                                 VM_STAT_ADD(anonvmstats.getpages[14]);
2197                                 return (-2);
2198                         }
2199                         page_unlock(pp);
2200                         prealloc = 1;
2201                         goto top;
2202                 }
2203
2204                 /*
2205                  * If we decided to preallocate but fop_getpage
2206                  * found a page in the system that satisfies our
2207                  * request then free up our preallocated large page
2208                  * and continue looping across the existing large
2209                  * page via fop_getpage.
2210                  */
2211                 if (prealloc && pp != ppa[pg_idx]) {
2212                         VM_STAT_ADD(anonvmstats.getpages[15]);
2213                         ASSERT(slotcreate == 0);
2214                         ASSERT(pg_idx == 0);
2215                         conpp = NULL;
2216                         prealloc = 0;
2217                         page_free_pages(ppa[0]);
2218                 }
2219
2220                 if (prealloc && nreloc > 1) {
2221                         /*
2222                          * we have relocated out of a smaller large page.
2223                          * skip npgs - 1 iterations and continue which will
2224                          * increment by one the loop indices.
2225                          */
2226                         spgcnt_t npgs = nreloc;
2227
2228                         VM_STAT_ADD(anonvmstats.getpages[16]);
2229
2230                         ASSERT(pp == ppa[pg_idx]);
2231                         ASSERT(slotcreate == 0);
2232                         ASSERT(pg_idx + npgs <= pgcnt);
2233                         if ((*protp & PROT_WRITE) &&
2234                             anon_share(amp->ahp, an_idx, npgs)) {
2235                                 *protp &= ~PROT_WRITE;
2236                         }
2237                         pg_idx += npgs;
2238                         an_idx += npgs;
2239                         vaddr += PAGESIZE * npgs;
2240                         continue;
2241                 }
2242
2243                 VM_STAT_ADD(anonvmstats.getpages[17]);
2244
2245                 /*
2246                  * Anon_zero case.
2247                  */
2248                 if (slotcreate) {
2249                         ASSERT(prealloc);
2250                         pagezero(pp, 0, PAGESIZE);
2251                         CPU_STATS_ADD_K(vm, zfod, 1);
2252                         hat_setrefmod(pp);
2253                 }
2254
2255                 ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2256                 ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2257                 ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2258
2259                 if (pg_idx > 0 &&
2260                     ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2261                     (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2262                         panic("anon_map_getpages: unexpected page");
2263                 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2264                         panic("anon_map_getpages: unaligned page");
2265                 }
2266
2267                 if (prealloc == 0) {
2268                         ppa[pg_idx] = pp;
2269                 }
2270
2271                 if (ap->an_refcnt > 1) {
2272                         VM_STAT_ADD(anonvmstats.getpages[18]);
2273                         *protp &= ~PROT_WRITE;
2274                 }
2275
2276                 /*
2277                  * If this is a new anon slot then initialize
2278                  * the anon array entry.
2279                  */
2280                 if (slotcreate) {
2281                         (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2282                 }
2283                 pg_idx++;
2284                 an_idx++;
2285                 vaddr += PAGESIZE;
2286         }
2287
2288         /*
2289          * Since preallocated pages come off the freelist
2290          * they are locked SE_EXCL. Simply downgrade and return.
2291          */
2292         if (prealloc) {
2293                 VM_STAT_ADD(anonvmstats.getpages[19]);
2294                 conpp = NULL;
2295                 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2296                         page_downgrade(ppa[pg_idx]);
2297                 }
2298         }
2299         ASSERT(conpp == NULL);
2300
2301         if (brkcow == 0 || (*protp & PROT_WRITE)) {
2302                 VM_STAT_ADD(anonvmstats.getpages[20]);
2303                 return (0);
2304         }
2305
2306         if (szc < seg->s_szc)
2307                 panic("anon_map_getpages: cowfault for szc %d", szc);
2308
2309         VM_STAT_ADD(anonvmstats.getpages[21]);
2310
2311         *protp = PROT_ALL;
2312         return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2313             ppa, vpage, anypgsz, pgflags, cred));
2314 io_err:
2315         /*
2316          * We got an IO error somewhere in our large page.
2317          * If we were using a preallocated page then just demote
2318          * all the constituent pages that we've succeeded with so far
2319          * to PAGESIZE pages and leave them in the system
2320          * unlocked.
2321          */
2322
2323         ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2324
2325         VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2326         VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2327         VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2328
2329         if (prealloc) {
2330                 conpp = NULL;
2331                 if (pg_idx > 0) {
2332                         VM_STAT_ADD(anonvmstats.getpages[25]);
2333                         for (i = 0; i < pgcnt; i++) {
2334                                 pp = ppa[i];
2335                                 ASSERT(PAGE_EXCL(pp));
2336                                 ASSERT(pp->p_szc == szc);
2337                                 pp->p_szc = 0;
2338                         }
2339                         for (i = 0; i < pg_idx; i++) {
2340                                 ASSERT(!hat_page_is_mapped(ppa[i]));
2341                                 page_unlock(ppa[i]);
2342                         }
2343                         /*
2344                          * Now free up the remaining unused constituent
2345                          * pages.
2346                          */
2347                         while (pg_idx < pgcnt) {
2348                                 ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2349                                 page_free(ppa[pg_idx], 0);
2350                                 pg_idx++;
2351                         }
2352                 } else {
2353                         VM_STAT_ADD(anonvmstats.getpages[26]);
2354                         page_free_pages(ppa[0]);
2355                 }
2356         } else {
2357                 VM_STAT_ADD(anonvmstats.getpages[27]);
2358                 ASSERT(err > 0);
2359                 for (i = 0; i < pg_idx; i++)
2360                         page_unlock(ppa[i]);
2361         }
2362         ASSERT(conpp == NULL);
2363         if (err != -1)
2364                 return (err);
2365         /*
2366          * we are here because we failed to relocate.
2367          */
2368         ASSERT(prealloc);
2369         if (brkcow == 0 || szc < seg->s_szc ||
2370             !anon_szcshare(amp->ahp, start_idx)) {
2371                 VM_STAT_ADD(anonvmstats.getpages[28]);
2372                 return (-1);
2373         }
2374         VM_STAT_ADD(anonvmstats.getpages[29]);
2375         goto docow;
2376 }
2377
2378
2379 /*
2380  * Turn a reference to an object or shared anon page into a private
2381  * page holding a copy of the original page's data.  A fresh anon slot
2382  * is allocated and stored through app.
2383  *
2384  * The caller passes in the original page, opp, already locked.  With
2385  * STEAL_PAGE it is renamed to the identity of the new anon slot and
2386  * returned as the private page.  Otherwise a new page is created, the
2387  * data is copied, the translation at addr is unloaded and the original
2388  * page is released.  NULL is returned on failure, in which case *app
2389  * is restored and the original page is unlocked.
2390  *
2391  * NOTE:  The original anon slot is not freed by this routine.
2392  *        It must be freed by the caller while holding the
2393  *        "anon_map" lock to prevent races which can occur if
2394  *        a process has multiple lwps in its address space.
2395  */
2396 page_t *
2397 anon_private(
2398         struct anon **app,
2399         struct seg *seg,
2400         caddr_t addr,
2401         uint_t  prot,
2402         page_t *opp,
2403         int oppflags,
2404         struct cred *cred)
2405 {
2406         struct anon *prev_ap = *app;
2407         struct anon *fresh_ap;
2408         struct vnode *swvp;
2409         anoff_t swoff;
2410         page_t *newpp = NULL;
2411         page_t *pl[1 + 1];
2412         int rc;
2413
2414         if (oppflags & STEAL_PAGE)
2415                 ASSERT(PAGE_EXCL(opp));
2416         else
2417                 ASSERT(PAGE_LOCKED(opp));
2418
2419         CPU_STATS_ADD_K(vm, cow_fault, 1);
2420
2421         fresh_ap = anon_alloc(NULL, 0);
2422         *app = fresh_ap;
2423         swap_xlate(fresh_ap, &swvp, &swoff);
2424
2425         if (oppflags & STEAL_PAGE) {
2426                 /* Stealing: rename the original page, no copy needed. */
2427                 page_rename(opp, &swvp->v_object, (uoff_t)swoff);
2428                 hat_setmod(opp);
2429
2430                 /* bug 4026339 */
2431                 page_downgrade(opp);
2432                 return (opp);
2433         }
2434
2435         /*
2436          * Call the fop_getpage routine to create the page, thereby
2437          * enabling the vnode driver to allocate any filesystem
2438          * space (e.g., disk block allocation for UFS).  This also
2439          * prevents more than one page from being added to the
2440          * vnode at the same time.
2441          */
2442         rc = fop_getpage(swvp, (uoff_t)swoff, PAGESIZE, NULL,
2443             pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2444         if (rc != 0)
2445                 goto fail;
2446
2447         newpp = pl[0];
2448
2449         /*
2450          * A locked original page has its lock moved to the copy by
2451          * transferring 'cowcnt/lckcnt' from the original page to the
2452          * new one.  See the statement at the beginning of
2453          * segvn_lockop() and the comments in page_pp_useclaim().
2454          *
2455          * For a read only mapping availrmem must be decremented up
2456          * front; page_pp_useclaim() bumps it back if that turns out
2457          * not to have been needed.
2458          */
2459         if (oppflags & LOCK_PAGE) {
2460                 if ((prot & PROT_WRITE) == 0) {
2461                         int reserved = 0;
2462
2463                         mutex_enter(&freemem_lock);
2464                         if (availrmem > pages_pp_maximum) {
2465                                 availrmem--;
2466                                 pages_useclaim++;
2467                                 reserved = 1;
2468                         }
2469                         mutex_exit(&freemem_lock);
2470                         if (!reserved)
2471                                 goto fail;
2472                 }
2473                 page_pp_useclaim(opp, newpp, prot & PROT_WRITE);
2474         }
2475
2476         /*
2477          * Copy the data from the original page, which the caller
2478          * keeps locked and loaded in the MMU so that no further
2479          * page fault is taken.
2480          */
2481         /* XXX - should set mod bit in here */
2482         if (ppcopy(opp, newpp) == 0) {
2483                 /*
2484                  * Before ppcopy could handle UE or other faults, we
2485                  * would have panicked here, and still have no option
2486                  * but to do so now.
2487                  */
2488                 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2489                     (void *)opp, (void *)newpp);
2490         }
2491
2492         hat_setrefmod(newpp);           /* mark as modified */
2493
2494         /* Drop the translation to the original page. */
2495         hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
2496
2497         /*
2498          * Free the original page if it is unmapped and unmodified,
2499          * or else release its lock; otherwise the process would
2500          * sleep forever in anon_decref() waiting for the
2501          * "exclusive" lock on the page.
2502          */
2503         (void) page_release(opp, 1);
2504
2505         /*
2506          * Page creation is finished, so downgrade the new page's
2507          * selock to shared.  This helps when multiple
2508          * as_fault(...SOFTLOCK...) are done to the same page (aio).
2509          */
2510         page_downgrade(newpp);
2511
2512         /*
2513          * NOTE:  The original anon slot must be freed by the
2514          * caller while holding the "anon_map" lock, if we
2515          * copied away from an anonymous page.
2516          */
2517         return (newpp);
2518
2519 fail:
2520         *app = prev_ap;
2521         if (newpp != NULL)
2522                 page_unlock(newpp);
2523         anon_decref(fresh_ap);
2524         page_unlock(opp);
2525         return (NULL);
2526 }
2527
     /*
      * Break copy-on-write sharing for one large page worth of anon slots.
      *
      * For each of the page_get_pagecnt(szc) slots starting at start_idx a
      * new anon slot and page are created, the data of the caller's page
      * ppa[i] is copied into the new page with ppcopy(), the original page
      * is unlocked, the old anon slot (if any) is passed to anon_decref()
      * and the new slot is installed in amp->ahp.  On return ppa[] holds
      * the new pages.
      *
      * Unless anypgsz is -1 the routine first tries to preallocate a large
      * page with page_alloc_pages() and hands its constituent pages to
      * swap_getconpage(); if that fails PAGESIZE pages are used instead.
      *
      * vpage, when not NULL, is indexed from 0 for the page at addr and is
      * consulted with VPP_ISPPLOCK() to move page locks to the new pages.
      *
      * Return values:
      *      0       success, or nothing to do because the first anon slot
      *              has a single reference and ppa[0] already has size
      *              code szc
      *      -1      the first anon slot has a single reference but ppa[0]
      *              has a smaller size code; the pages in ppa[] have been
      *              unlocked
      *      ENOMEM  availrmem could not be reserved for locked pages; the
      *              non-NULL pages in ppa[] have been unlocked
      */
2528 int
2529 anon_map_privatepages(
2530         struct anon_map *amp,
2531         ulong_t start_idx,
2532         uint_t  szc,
2533         struct seg *seg,
2534         caddr_t addr,
2535         uint_t  prot,
2536         page_t  *ppa[],
2537         struct vpage vpage[],
2538         int anypgsz,
2539         int pgflags,
2540         struct cred *cred)
2541 {
2542         pgcnt_t         pgcnt;
2543         struct vnode    *vp;
2544         anoff_t         off;
2545         page_t          *pl[2], *conpp = NULL;
2546         int             err;
2547         int             prealloc = 1;
2548         struct anon     *ap, *oldap;
2549         caddr_t         vaddr;
2550         page_t          *pplist, *pp;
2551         ulong_t         pg_idx, an_idx;
2552         spgcnt_t        nreloc = 0;
2553         int             pagelock = 0;
2554         kmutex_t        *ahmpages = NULL;
             /* refcnt is only referenced from ASSERTs in the copy loop. */
2555 #ifdef DEBUG
2556         int             refcnt;
2557 #endif
2558
2559         ASSERT(szc != 0);
2560         ASSERT(szc == seg->s_szc);
2561
2562         VM_STAT_ADD(anonvmstats.privatepages[0]);
2563
2564         pgcnt = page_get_pagecnt(szc);
2565         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2566         ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2567
2568         ASSERT(amp != NULL);
2569         ap = anon_get_ptr(amp->ahp, start_idx);
2570         ASSERT(ap == NULL || ap->an_refcnt >= 1);
2571
2572         VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);
2573
2574         /*
2575          * Now try and allocate the large page. If we fail then just
2576          * let fop_getpage give us PAGESIZE pages. Normally we let
2577          * the caller make this decision but to avoid added complexity
2578          * it's simpler to handle that case here.
2579          */
2580         if (anypgsz == -1) {
2581                 VM_STAT_ADD(anonvmstats.privatepages[2]);
2582                 prealloc = 0;
2583         } else if (page_alloc_pages(&anon_vp->v_object, seg, addr, &pplist,
2584             NULL, szc, anypgsz, pgflags) != 0) {
2585                 VM_STAT_ADD(anonvmstats.privatepages[3]);
2586                 prealloc = 0;
2587         }
2588
2589         /*
2590          * make the decrement of all refcnts of all
2591          * anon slots of a large page appear atomic by
2592          * getting an anonpages_hash_lock for the
2593          * first anon slot of a large page.
2594          */
2595         if (ap != NULL) {
2596                 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
2597                 mutex_enter(ahmpages);
2598                 if (ap->an_refcnt == 1) {
                             /*
                              * The first slot is not shared, so nothing is
                              * copied: drop the lock and give back any
                              * preallocated large page.
                              */
2599                         VM_STAT_ADD(anonvmstats.privatepages[4]);
2600                         ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
2601                         mutex_exit(ahmpages);
2602
2603                         if (prealloc) {
2604                                 page_free_replacement_page(pplist);
2605                                 page_create_putback(pgcnt);
2606                         }
2607                         ASSERT(ppa[0]->p_szc <= szc);
2608                         if (ppa[0]->p_szc == szc) {
2609                                 VM_STAT_ADD(anonvmstats.privatepages[5]);
2610                                 return (0);
2611                         }
                             /*
                              * The caller's pages are smaller than szc:
                              * unlock them all and report -1.
                              */
2612                         for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2613                                 ASSERT(ppa[pg_idx] != NULL);
2614                                 page_unlock(ppa[pg_idx]);
2615                         }
2616                         return (-1);
2617                 }
                     /*
                      * an_refcnt is not 1 here: ahmpages stays held and is
                      * only dropped after the copy loop below (or on the
                      * ENOMEM path).
                      */
2618         }
2619
2620         /*
2621          * If we are passed in the vpage array and this is
2622          * not PROT_WRITE then we need to decrement availrmem
2623          * up front before we try anything. If we need to and
2624          * can't decrement availrmem then its better to fail now
2625          * than in the middle of processing the new large page.
2626          * page_pp_useclaim() on behalf of each constituent page
2627          * below will adjust availrmem back for the cases not needed.
2628          */
2629         if (vpage != NULL && (prot & PROT_WRITE) == 0) {
                     /* pagelock is set if any constituent page is locked. */
2630                 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2631                         if (VPP_ISPPLOCK(&vpage[pg_idx])) {
2632                                 pagelock = 1;
2633                                 break;
2634                         }
2635                 }
2636                 if (pagelock) {
2637                         VM_STAT_ADD(anonvmstats.privatepages[6]);
2638                         mutex_enter(&freemem_lock);
2639                         if (availrmem >= pages_pp_maximum + pgcnt) {
2640                                 availrmem -= pgcnt;
2641                                 pages_useclaim += pgcnt;
2642                         } else {
                                     /*
                                      * Cannot reserve: undo everything
                                      * acquired so far and fail.
                                      */
2643                                 VM_STAT_ADD(anonvmstats.privatepages[7]);
2644                                 mutex_exit(&freemem_lock);
2645                                 if (ahmpages != NULL) {
2646                                         mutex_exit(ahmpages);
2647                                 }
2648                                 if (prealloc) {
2649                                         page_free_replacement_page(pplist);
2650                                         page_create_putback(pgcnt);
2651                                 }
2652                                 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
2653                                         if (ppa[pg_idx] != NULL)
2654                                                 page_unlock(ppa[pg_idx]);
2655                                 return (ENOMEM);
2656                         }
2657                         mutex_exit(&freemem_lock);
2658                 }
2659         }
2660
2661         CPU_STATS_ADD_K(vm, cow_fault, pgcnt);
2662
2663         VM_STAT_ADD(anonvmstats.privatepages[8]);
2664
             /*
              * Copy loop: pg_idx walks ppa[]/vpage[], an_idx walks the anon
              * array and vaddr the addresses, one PAGESIZE page per pass.
              */
2665         an_idx = start_idx;
2666         pg_idx = 0;
2667         vaddr = addr;
2668         for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
2669                 ASSERT(ppa[pg_idx] != NULL);
2670                 oldap = anon_get_ptr(amp->ahp, an_idx);
2671                 ASSERT(ahmpages != NULL || oldap == NULL);
2672                 ASSERT(ahmpages == NULL || oldap != NULL);
2673                 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
                     /*
                      * The next two ASSERTs record the first slot's
                      * an_refcnt in refcnt and then check that every other
                      * slot of the large page has the same count.
                      */
2674                 ASSERT(ahmpages == NULL || pg_idx != 0 ||
2675                     (refcnt = oldap->an_refcnt));
2676                 ASSERT(ahmpages == NULL || pg_idx == 0 ||
2677                     refcnt == oldap->an_refcnt);
2678
2679                 ap = anon_alloc(NULL, 0);
2680
2681                 swap_xlate(ap, &vp, &off);
2682
2683                 /*
2684                  * Now setup our preallocated page to pass down to
2685                  * swap_getpage().
2686                  */
2687                 if (prealloc) {
2688                         pp = pplist;
2689                         page_sub(&pplist, pp);
2690                         conpp = pp;
2691                 }
2692
2693                 err = swap_getconpage(vp, (uoff_t)off, PAGESIZE, NULL, pl,
2694                     PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
2695                     S_CREATE, cred);
2696
2697                 /*
2698                  * Impossible to fail this is S_CREATE.
2699                  */
2700                 if (err)
2701                         panic("anon_map_privatepages: fop_getpage failed");
2702
2703                 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
2704                 ASSERT(prealloc == 0 || nreloc == 1);
2705
2706                 pp = pl[0];
2707
2708                 /*
2709                  * If the original page was locked, we need to move
2710                  * the lock to the new page by transferring
2711                  * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
2712                  * of the new page. pg_idx can be used to index
2713                  * into the vpage array since the caller will guarantee
2714                  * that vpage struct passed in corresponds to addr
2715                  * and forward.
2716                  */
2717                 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
2718                         page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
2719                 } else if (pagelock) {
                             /*
                              * This page is not locked: give back the
                              * availrmem that was reserved for it up front.
                              */
2720                         mutex_enter(&freemem_lock);
2721                         availrmem++;
2722                         pages_useclaim--;
2723                         mutex_exit(&freemem_lock);
2724                 }
2725
2726                 /*
2727                  * Now copy the contents from the original page.
2728                  */
2729                 if (ppcopy(ppa[pg_idx], pp) == 0) {
2730                         /*
2731                          * Before ppcopy could handle UE or other faults, we
2732                          * would have panicked here, and still have no option
2733                          * but to do so now.
2734                          */
2735                         panic("anon_map_privatepages, ppcopy failed");
2736                 }
2737
2738                 hat_setrefmod(pp);              /* mark as modified */
2739
2740                 /*
2741                  * Release the lock on the original page,
2742                  * decrement the old slot, and downgrade the lock
2743                  * on the new copy.
2744                  */
2745                 page_unlock(ppa[pg_idx]);
2746
                     /*
                      * Preallocated pages are downgraded together once the
                      * loop has finished (see below).
                      */
2747                 if (!prealloc)
2748                         page_downgrade(pp);
2749
2750                 ppa[pg_idx] = pp;
2751
2752                 /*
2753                  * Now reflect the copy in the new anon array.
2754                  */
2755                 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2756                 if (oldap != NULL)
2757                         anon_decref(oldap);
2758                 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2759         }
2760
2761         /*
2762          * Unload the old large page translation.
2763          */
2764         hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
2765
2766         if (ahmpages != NULL) {
2767                 mutex_exit(ahmpages);
2768         }
             /* Every constituent of a preallocated page must be used up. */
2769         ASSERT(prealloc == 0 || pplist == NULL);
2770         if (prealloc) {
2771                 VM_STAT_ADD(anonvmstats.privatepages[9]);
2772                 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2773                         page_downgrade(ppa[pg_idx]);
2774                 }
2775         }
2776
2777         return (0);
2778 }
2779
2780 /*
2781  * Allocate a private zero-filled anon page.  The new anon slot is
2782  * returned through app; on failure *app is NULL and NULL is returned.
2783  */
2784 page_t *
2785 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2786 {
2787         struct anon *slot;
2788         struct vnode *swvp;
2789         anoff_t swoff;
2790         page_t *pl[1 + 1];
2791         page_t *zpp;
2792
2793         slot = anon_alloc(NULL, 0);
2794         *app = slot;
2795         swap_xlate(slot, &swvp, &swoff);
2796
2797         /*
2798          * Create the page through fop_getpage so that the vnode driver
2799          * can allocate any filesystem dependent structures (e.g., disk
2800          * block allocation for UFS).  This also prevents more than one
2801          * page from being added to the vnode at the same time since it
2802          * is locked.
2803          */
2804         if (fop_getpage(swvp, swoff, PAGESIZE, NULL, pl, PAGESIZE, seg,
2805             addr, S_CREATE, cred, NULL) != 0) {
2806                 *app = NULL;
2807                 anon_decref(slot);
2808                 return (NULL);
2809         }
2810
2811         zpp = pl[0];
2812         pagezero(zpp, 0, PAGESIZE);     /* XXX - should set mod bit */
2813         page_downgrade(zpp);
2814         CPU_STATS_ADD_K(vm, zfod, 1);
2815         hat_setrefmod(zpp);     /* mark as modified so pageout writes back */
2816         return (zpp);
2817 }
2818
2819
2820 /*
2821  * Allocate array of private zero-filled anon pages for empty slots
2822  * and kept pages for non empty slots within given range.
2823  *
2824  * NOTE: This routine will try and use large pages
2825  *      if available and supported by underlying platform.
      *
      * The range covers btopr(len) pages starting at anon index start_index
      * and address addr; the page for each slot is stored in ppa[] in order.
      * amp->a_rwlock is held as writer for the whole walk.
      *
      * Only rw == S_CREATE is supported (asserted).  The routine always
      * returns 0: a failure of anon_getpage(), anon_zero() or
      * swap_getconpage() panics instead of being reported.
2826  */
2827 int
2828 anon_map_createpages(
2829         struct anon_map *amp,
2830         ulong_t start_index,
2831         size_t len,
2832         page_t *ppa[],
2833         struct seg *seg,
2834         caddr_t addr,
2835         enum seg_rw rw,
2836         struct cred *cred)
2837 {
2838
2839         struct anon     *ap;
2840         struct vnode    *ap_vp;
2841         page_t          *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2842         int             err = 0;
2843         ulong_t         p_index, index;
2844         pgcnt_t         npgs, pg_cnt;
2845         spgcnt_t        nreloc = 0;
2846         uint_t          l_szc, szc, prot;
2847         anoff_t         ap_off;
2848         size_t          pgsz;
2849         lgrp_t          *lgrp;
2850         kmutex_t        *ahm;
2851
2852         /*
2853          * XXX For now only handle S_CREATE.
2854          */
2855         ASSERT(rw == S_CREATE);
2856
             /*
              * index walks the anon array, p_index walks ppa[] and npgs
              * counts the pages still to be handled.
              */
2857         index   = start_index;
2858         p_index = 0;
2859         npgs = btopr(len);
2860
2861         /*
2862          * If this platform supports multiple page sizes
2863          * then try and allocate directly from the free
2864          * list for pages larger than PAGESIZE.
2865          *
2866          * NOTE:When we have page_create_ru we can stop
2867          *      directly allocating from the freelist.
2868          */
2869         l_szc  = seg->s_szc;
2870         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2871         while (npgs) {
2872
2873                 /*
2874                  * if anon slot already exists
2875                  *   (means page has been created)
2876                  * so 1) look up the page
2877                  *    2) if the page is still in memory, get it.
2878                  *    3) if not, create a page and
2879                  *        page in from physical swap device.
2880                  * These are done in anon_getpage().
2881                  */
2882                 ap = anon_get_ptr(amp->ahp, index);
2883                 if (ap) {
2884                         err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2885                             seg, addr, S_READ, cred);
2886                         if (err) {
2887                                 ANON_LOCK_EXIT(&amp->a_rwlock);
2888                                 panic("anon_map_createpages: anon_getpage");
2889                         }
2890                         pp = anon_pl[0];
2891                         ppa[p_index++] = pp;
2892
2893                         /*
2894                          * an_pvp can become non-NULL after SysV's page was
2895                          * paged out before ISM was attached to this SysV
2896                          * shared memory segment. So free swap slot if needed.
2897                          */
2898                         if (ap->an_pvp != NULL) {
2899                                 page_io_lock(pp);
2900                                 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2901                                 mutex_enter(ahm);
                                     /* Re-check an_pvp now that ahm is held. */
2902                                 if (ap->an_pvp != NULL) {
2903                                         swap_phys_free(ap->an_pvp,
2904                                             ap->an_poff, PAGESIZE);
2905                                         ap->an_pvp = NULL;
2906                                         ap->an_poff = 0;
2907                                         mutex_exit(ahm);
2908                                         hat_setmod(pp);
2909                                 } else {
2910                                         mutex_exit(ahm);
2911                                 }
2912                                 page_io_unlock(pp);
2913                         }
2914
2915                         addr += PAGESIZE;
2916                         index++;
2917                         npgs--;
2918                         continue;
2919                 }
2920                 /*
2921                  * Now try and allocate the largest page possible
2922                  * for the current address and range.
2923                  * Keep dropping down in page size until:
2924                  *
2925                  *      1) Properly aligned
2926                  *      2) Does not overlap existing anon pages
2927                  *      3) Fits in remaining range.
2928                  *      4) able to allocate one.
2929                  *
2930                  * NOTE: XXX When page_create_ru is completed this code
2931                  *       will change.
2932                  */
2933                 szc    = l_szc;
2934                 pplist = NULL;
2935                 pg_cnt = 0;
2936                 while (szc) {
2937                         pgsz    = page_get_pagesize(szc);
2938                         pg_cnt  = pgsz >> PAGESHIFT;
2939                         if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2940                             anon_pages(amp->ahp, index, pg_cnt) == 0) {
2941                                 /*
2942                                  * XXX
2943                                  * Since we are faking page_create()
2944                                  * we also need to do the freemem and
2945                                  * pcf accounting.
2946                                  */
2947                                 (void) page_create_wait(pg_cnt, PG_WAIT);
2948
2949                                 /*
2950                                  * Get lgroup to allocate next page of shared
2951                                  * memory from and use it to specify where to
2952                                  * allocate the physical memory
2953                                  */
2954                                 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2955
2956                                 pplist = page_get_freelist(
2957                                     &anon_vp->v_object, 0, seg,
2958                                     addr, pgsz, 0, lgrp);
2959
                                     /*
                                      * No page of this size: undo the
                                      * page_create_wait() accounting above.
                                      */
2960                                 if (pplist == NULL) {
2961                                         page_create_putback(pg_cnt);
2962                                 }
2963
2964                                 /*
2965                                  * If a request for a page of size
2966                                  * larger than PAGESIZE failed
2967                                  * then don't try that size anymore.
2968                                  */
2969                                 if (pplist == NULL) {
2970                                         l_szc = szc - 1;
2971                                 } else {
2972                                         break;
2973                                 }
2974                         }
2975                         szc--;
2976                 }
2977
2978                 /*
2979                  * If just using PAGESIZE pages then don't
2980                  * directly allocate from the free list.
2981                  */
2982                 if (pplist == NULL) {
2983                         ASSERT(szc == 0);
2984                         pp = anon_zero(seg, addr, &ap, cred);
2985                         if (pp == NULL) {
2986                                 ANON_LOCK_EXIT(&amp->a_rwlock);
2987                                 panic("anon_map_createpages: anon_zero");
2988                         }
2989                         ppa[p_index++] = pp;
2990
2991                         ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
2992                         (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
2993
2994                         addr += PAGESIZE;
2995                         index++;
2996                         npgs--;
2997                         continue;
2998                 }
2999
3000                 /*
3001                  * pplist is a list of pg_cnt PAGESIZE pages.
3002                  * These pages are locked SE_EXCL since they
3003                  * came directly off the free list.
3004                  */
3005                 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
3006                 ASSERT(IS_P2ALIGNED(index, pg_cnt));
3007                 ASSERT(conpp == NULL);
                     /*
                      * Give each constituent page its own anon slot, zero it
                      * and record it in both the anon array and ppa[].
                      */
3008                 while (pg_cnt--) {
3009
3010                         ap = anon_alloc(NULL, 0);
3011                         swap_xlate(ap, &ap_vp, &ap_off);
3012
3013                         ASSERT(pplist != NULL);
3014                         pp = pplist;
3015                         page_sub(&pplist, pp);
3016                         PP_CLRFREE(pp);
3017                         PP_CLRAGED(pp);
3018                         conpp = pp;
3019
3020                         err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
3021                             (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
3022                             &nreloc, seg, addr, S_CREATE, cred);
3023
3024                         if (err) {
3025                                 ANON_LOCK_EXIT(&amp->a_rwlock);
3026                                 panic("anon_map_createpages: S_CREATE");
3027                         }
3028
3029                         ASSERT(anon_pl[0] == pp);
3030                         ASSERT(nreloc == 1);
3031                         pagezero(pp, 0, PAGESIZE);
3032                         CPU_STATS_ADD_K(vm, zfod, 1);
3033                         hat_setrefmod(pp);
3034
3035                         ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3036                         (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3037
3038                         ppa[p_index++] = pp;
3039
3040                         addr += PAGESIZE;
3041                         index++;
3042                         npgs--;
3043                 }
3044                 conpp = NULL;
                     /*
                      * pg_cnt was counted down by the loop above: recompute
                      * it from pgsz and rewind p_index to the first
                      * constituent page so that all of them are downgraded.
                      */
3045                 pg_cnt  = pgsz >> PAGESHIFT;
3046                 p_index = p_index - pg_cnt;
3047                 while (pg_cnt--) {
3048                         page_downgrade(ppa[p_index++]);
3049                 }
3050         }
3051         ANON_LOCK_EXIT(&amp->a_rwlock);
3052         return (0);
3053 }
3054
3055 static int
3056 anon_try_demote_pages(
3057         struct anon_hdr *ahp,
3058         ulong_t sidx,
3059         uint_t szc,
3060         page_t **ppa,
3061         int private)
3062 {
3063         struct anon     *ap;
3064         pgcnt_t         pgcnt = page_get_pagecnt(szc);
3065         page_t          *pp;
3066         pgcnt_t         i;
3067         kmutex_t        *ahmpages = NULL;
3068         int             root = 0;
3069         pgcnt_t         npgs;
3070         pgcnt_t         curnpgs = 0;
3071         size_t          ppasize = 0;
3072
3073         ASSERT(szc != 0);
3074         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3075         ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3076         ASSERT(sidx < ahp->size);
3077
3078         if (ppa == NULL) {
3079                 ppasize = pgcnt * sizeof (page_t *);
3080                 ppa = kmem_alloc(ppasize, KM_SLEEP);
3081         }
3082
3083         ap = anon_get_ptr(ahp, sidx);
3084         if (ap != NULL && private) {
3085                 VM_STAT_ADD(anonvmstats.demotepages[1]);
3086                 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3087                 mutex_enter(ahmpages);
3088         }
3089
3090         if (ap != NULL && ap->an_refcnt > 1) {
3091                 if (ahmpages != NULL) {
3092                         VM_STAT_ADD(anonvmstats.demotepages[2]);
3093                         mutex_exit(ahmpages);
3094                 }
3095                 if (ppasize != 0) {
3096                         kmem_free(ppa, ppasize);
3097                 }
3098                 return (0);
3099         }
3100         if (ahmpages != NULL) {
3101                 mutex_exit(ahmpages);
3102         }
3103         if (ahp->size - sidx < pgcnt) {
3104                 ASSERT(private == 0);
3105                 pgcnt = ahp->size - sidx;
3106         }
3107         for (i = 0; i < pgcnt; i++, sidx++) {
3108                 ap = anon_get_ptr(ahp, sidx);
3109                 if (ap != NULL) {
3110                         if (ap->an_refcnt != 1) {
3111                                 panic("anon_try_demote_pages: an_refcnt != 1");
3112                         }
3113                         pp = ppa[i] = page_lookup(&ap->an_vp->v_object,
3114                                                     ap->an_off, SE_EXCL);
3115                         if (pp != NULL) {
3116                                 (void) hat_pageunload(pp,
3117                                     HAT_FORCE_PGUNLOAD);
3118                         }
3119                 } else {
3120                         ppa[i] = NULL;
3121                 }
3122         }
3123         for (i = 0; i < pgcnt; i++) {
3124                 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3125                         ASSERT(pp->p_szc <= szc);
3126                         if (!root) {
3127                                 VM_STAT_ADD(anonvmstats.demotepages[3]);
3128                                 if (curnpgs != 0)
3129                                         panic("anon_try_demote_pages: "
3130                                             "bad large page");
3131
3132                                 root = 1;
3133                                 curnpgs = npgs =
3134                                     page_get_pagecnt(pp->p_szc);
3135
3136                                 ASSERT(npgs <= pgcnt);
3137                                 ASSERT(IS_P2ALIGNED(npgs, npgs));
3138                                 ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3139                         } else {
3140                                 ASSERT(i > 0);
3141                                 ASSERT(page_pptonum(pp) - 1 ==
3142                                     page_pptonum(ppa[i - 1]));
3143                                 if ((page_pptonum(pp) & (npgs - 1)) ==
3144                                     npgs - 1)
3145                                         root = 0;
3146                         }
3147                         ASSERT(PAGE_EXCL(pp));
3148                         pp->p_szc = 0;
3149                         ASSERT(curnpgs > 0);
3150                         curnpgs--;
3151                 }
3152         }
3153         if (root != 0 || curnpgs != 0)
3154                 panic("anon_try_demote_pages: bad large page");
3155
3156         for (i = 0; i < pgcnt; i++) {
3157                 if ((pp = ppa[i]) != NULL) {
3158                         ASSERT(!hat_page_is_mapped(pp));
3159                         ASSERT(pp->p_szc == 0);
3160                         page_unlock(pp);
3161                 }
3162         }
3163         if (ppasize != 0) {
3164                 kmem_free(ppa, ppasize);
3165         }
3166         return (1);
3167 }
3168
3169 /*
3170  * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
3171  */
3172 int
3173 anon_map_demotepages(
3174         struct anon_map *amp,
3175         ulong_t start_idx,
3176         struct seg *seg,
3177         caddr_t addr,
3178         uint_t prot,
3179         struct vpage vpage[],
3180         struct cred *cred)
3181 {
3182         struct anon     *ap;
3183         uint_t          szc = seg->s_szc;
3184         pgcnt_t         pgcnt = page_get_pagecnt(szc);
3185         size_t          ppasize = pgcnt * sizeof (page_t *);
3186         page_t          **ppa = kmem_alloc(ppasize, KM_SLEEP);
3187         page_t          *pp;
3188         page_t          *pl[2];
3189         pgcnt_t         i, pg_idx;
3190         ulong_t         an_idx;
3191         caddr_t         vaddr;
3192         int             err;
3193         int             retry = 0;
3194         uint_t          vpprot;
3195
3196         ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
3197         ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3198         ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3199         ASSERT(ppa != NULL);
3200         ASSERT(szc != 0);
3201         ASSERT(szc == amp->a_szc);
3202
3203         VM_STAT_ADD(anonvmstats.demotepages[0]);
3204
3205 top:
3206         if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3207                 kmem_free(ppa, ppasize);
3208                 return (0);
3209         }
3210
3211         VM_STAT_ADD(anonvmstats.demotepages[4]);
3212
3213         ASSERT(retry == 0); /* we can be here only once */
3214
3215         vaddr = addr;
3216         for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3217             pg_idx++, an_idx++, vaddr += PAGESIZE) {
3218                 ap = anon_get_ptr(amp->ahp, an_idx);
3219                 if (ap == NULL)
3220                         panic("anon_map_demotepages: no anon slot");
3221                 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3222                     S_READ, cred);
3223                 if (err) {
3224                         for (i = 0; i < pg_idx; i++) {
3225                                 if ((pp = ppa[i]) != NULL)
3226                                         page_unlock(pp);
3227                         }
3228                         kmem_free(ppa, ppasize);
3229                         return (err);
3230                 }
3231                 ppa[pg_idx] = pl[0];
3232         }
3233
3234         err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3235             vpage, -1, 0, cred);
3236         if (err > 0) {
3237                 VM_STAT_ADD(anonvmstats.demotepages[5]);
3238                 kmem_free(ppa, ppasize);
3239                 return (err);
3240         }
3241         ASSERT(err == 0 || err == -1);
3242         if (err == -1) {
3243                 VM_STAT_ADD(anonvmstats.demotepages[6]);
3244                 retry = 1;
3245                 goto top;
3246         }
3247         for (i = 0; i < pgcnt; i++) {
3248                 ASSERT(ppa[i] != NULL);
3249                 if (ppa[i]->p_szc != 0)
3250                         retry = 1;
3251                 page_unlock(ppa[i]);
3252         }
3253         if (retry) {
3254                 VM_STAT_ADD(anonvmstats.demotepages[7]);
3255                 goto top;
3256         }
3257
3258         VM_STAT_ADD(anonvmstats.demotepages[8]);
3259
3260         kmem_free(ppa, ppasize);
3261
3262         return (0);
3263 }
3264
3265 /*
3266  * Free pages of shared anon map. It's assumed that anon maps don't share anon
3267  * structures with private anon maps. Therefore all anon structures should
3268  * have at most one reference at this point. This means underlying pages can
3269  * be exclusively locked and demoted or freed.  If not freeing the entire
3270  * large pages demote the ends of the region we free to be able to free
3271  * subpages. Page roots correspond to aligned index positions in anon map.
3272  */
3273 void
3274 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3275 {
3276         ulong_t eidx = sidx + btopr(len);
3277         pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3278         struct anon_hdr *ahp = amp->ahp;
3279         ulong_t tidx;
3280         size_t size;
3281         ulong_t sidx_aligned;
3282         ulong_t eidx_aligned;
3283
3284         ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3285         ASSERT(amp->refcnt <= 1);
3286         ASSERT(amp->a_szc > 0);
3287         ASSERT(eidx <= ahp->size);
3288         ASSERT(!anon_share(ahp, sidx, btopr(len)));
3289
3290         if (len == 0) { /* XXX */
3291                 return;
3292         }
3293
3294         sidx_aligned = P2ALIGN(sidx, pages);
3295         if (sidx_aligned != sidx ||
3296             (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3297                 if (!anon_try_demote_pages(ahp, sidx_aligned,
3298                     amp->a_szc, NULL, 0)) {
3299                         panic("anon_shmap_free_pages: demote failed");
3300                 }
3301                 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3302                     P2NPHASE(sidx, pages);
3303                 size <<= PAGESHIFT;
3304                 anon_free(ahp, sidx, size);
3305                 sidx = sidx_aligned + pages;
3306                 if (eidx <= sidx) {
3307                         return;
3308                 }
3309         }
3310         eidx_aligned = P2ALIGN(eidx, pages);
3311         if (sidx < eidx_aligned) {
3312                 anon_free_pages(ahp, sidx,
3313                     (eidx_aligned - sidx) << PAGESHIFT,
3314                     amp->a_szc);
3315                 sidx = eidx_aligned;
3316         }
3317         ASSERT(sidx == eidx_aligned);
3318         if (eidx == eidx_aligned) {
3319                 return;
3320         }
3321         tidx = eidx;
3322         if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3323             tidx - sidx < pages) {
3324                 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3325                         panic("anon_shmap_free_pages: demote failed");
3326                 }
3327                 size = (eidx - sidx) << PAGESHIFT;
3328                 anon_free(ahp, sidx, size);
3329         } else {
3330                 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3331         }
3332 }
3333
3334 /*
3335  * This routine should be called with amp's writer lock when there're no other
3336  * users of amp.  All pcache entries of this amp must have been already
3337  * inactivated. We must not drop a_rwlock here to prevent new users from
3338  * attaching to this amp.
3339  */
3340 void
3341 anonmap_purge(struct anon_map *amp)
3342 {
3343         ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3344         ASSERT(amp->refcnt <= 1);
3345
3346         if (amp->a_softlockcnt != 0) {
3347                 seg_ppurge(NULL, amp, 0);
3348         }
3349
3350         /*
3351          * Since all pcache entries were already inactive before this routine
3352          * was called seg_ppurge() couldn't return while there're still
3353          * entries that can be found via the list anchored at a_phead. So we
3354          * can assert this list is empty now. a_softlockcnt may be still non 0
3355          * if asynchronous thread that manages pcache already removed pcache
3356          * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
3357          * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
3358          * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
3359          * before shamp_reclaim() is done with it. a_purgemtx also taken by
3360          * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
3361          * barrier that prevents anonmap_purge() to complete while
3362          * shamp_reclaim() may still be referencing this amp.
3363          */
3364         ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3365         ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3366
3367         mutex_enter(&amp->a_purgemtx);
3368         while (amp->a_softlockcnt != 0) {
3369                 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3370                 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3371                 amp->a_purgewait = 1;
3372                 cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
3373         }
3374         mutex_exit(&amp->a_purgemtx);
3375
3376         ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3377         ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3378         ASSERT(amp->a_softlockcnt == 0);
3379 }
3380
3381 /*
3382  * Allocate and initialize an anon_map structure for seg
3383  * associating the given swap reservation with the new anon_map.
3384  */
3385 struct anon_map *
3386 anonmap_alloc(size_t size, size_t swresv, int flags)
3387 {
3388         struct anon_map *amp;
3389         int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3390
3391         amp = kmem_cache_alloc(anonmap_cache, kmflags);
3392         if (amp == NULL) {
3393                 ASSERT(kmflags == KM_NOSLEEP);
3394                 return (NULL);
3395         }
3396
3397         amp->ahp = anon_create(btopr(size), flags);
3398         if (amp->ahp == NULL) {
3399                 ASSERT(flags == ANON_NOSLEEP);
3400                 kmem_cache_free(anonmap_cache, amp);
3401                 return (NULL);
3402         }
3403         amp->refcnt = 1;
3404         amp->size = size;
3405         amp->swresv = swresv;
3406         amp->locality = 0;
3407         amp->a_szc = 0;
3408         amp->a_sp = NULL;
3409         amp->a_softlockcnt = 0;
3410         amp->a_purgewait = 0;
3411         amp->a_phead.p_lnext = &amp->a_phead;
3412         amp->a_phead.p_lprev = &amp->a_phead;
3413
3414         return (amp);
3415 }
3416
3417 void
3418 anonmap_free(struct anon_map *amp)
3419 {
3420         ASSERT(amp->ahp != NULL);
3421         ASSERT(amp->refcnt == 0);
3422         ASSERT(amp->a_softlockcnt == 0);
3423         ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3424         ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3425
3426         lgrp_shm_policy_fini(amp, NULL);
3427         anon_release(amp->ahp, btopr(amp->size));
3428         kmem_cache_free(anonmap_cache, amp);
3429 }
3430
3431 /*
3432  * Returns true if the app array has some empty slots.
3433  * The offp and lenp parameters are in/out parameters.  On entry
3434  * these values represent the starting offset and length of the
3435  * mapping.  When true is returned, these values may be modified
3436  * to be the largest range which includes empty slots.
3437  */
3438 int
3439 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, uoff_t *offp,
3440                                 size_t *lenp)
3441 {
3442         ulong_t i, el;
3443         ssize_t low, high;
3444         struct anon *ap;
3445
3446         low = -1;
3447         for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3448                 ap = anon_get_ptr(ahp, anon_idx);
3449                 if (ap == NULL) {
3450                         if (low == -1)
3451                                 low = i;
3452                         high = i;
3453                 }
3454         }
3455         if (low != -1) {
3456                 /*
3457                  * Found at least one non-anon page.
3458                  * Set up the off and len return values.
3459                  */
3460                 if (low != 0)
3461                         *offp += low;
3462                 *lenp = high - low + PAGESIZE;
3463                 return (1);
3464         }
3465         return (0);
3466 }
3467
3468 /*
3469  * Return a count of the number of existing anon pages in the anon array
3470  * app in the range (off, off+len). The array and slots must be guaranteed
3471  * stable by the caller.
3472  */
3473 pgcnt_t
3474 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3475 {
3476         pgcnt_t cnt = 0;
3477
3478         while (nslots-- > 0) {
3479                 if ((anon_get_ptr(ahp, anon_index)) != NULL)
3480                         cnt++;
3481                 anon_index++;
3482         }
3483         return (cnt);
3484 }
3485
3486 /*
3487  * Move reserved phys swap into memory swap (unreserve phys swap
3488  * and reserve mem swap by the same amount).
3489  * Used by segspt when it needs to lock reserved swap npages in memory
3490  */
3491 int
3492 anon_swap_adjust(pgcnt_t npages)
3493 {
3494         pgcnt_t unlocked_mem_swap;
3495
3496         mutex_enter(&anoninfo_lock);
3497
3498         ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3499         ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3500
3501         unlocked_mem_swap = k_anoninfo.ani_mem_resv
3502             - k_anoninfo.ani_locked_swap;
3503         if (npages > unlocked_mem_swap) {
3504                 spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
3505
3506                 /*
3507                  * if there is not enough unlocked mem swap we take missing
3508                  * amount from phys swap and give it to mem swap
3509                  */
3510                 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
3511                         mutex_exit(&anoninfo_lock);
3512                         return (ENOMEM);
3513                 }
3514
3515                 k_anoninfo.ani_mem_resv += adjusted_swap;
3516                 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
3517                 k_anoninfo.ani_phys_resv -= adjusted_swap;
3518
3519                 ANI_ADD(adjusted_swap);
3520         }
3521         k_anoninfo.ani_locked_swap += npages;
3522
3523         ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3524         ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3525
3526         mutex_exit(&anoninfo_lock);
3527
3528         return (0);
3529 }
3530
3531 /*
3532  * 'unlocked' reserved mem swap so when it is unreserved it
3533  * can be moved back phys (disk) swap
3534  */
3535 void
3536 anon_swap_restore(pgcnt_t npages)
3537 {
3538         mutex_enter(&anoninfo_lock);
3539
3540         ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3541
3542         ASSERT(k_anoninfo.ani_locked_swap >= npages);
3543         k_anoninfo.ani_locked_swap -= npages;
3544
3545         ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3546
3547         mutex_exit(&anoninfo_lock);
3548 }
3549
3550 /*
3551  * Return the pointer from the list for a
3552  * specified anon index.
3553  */
3554 ulong_t *
3555 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
3556 {
3557         struct anon     **app;
3558         void            **ppp;
3559
3560         ASSERT(an_idx < ahp->size);
3561
3562         /*
3563          * Single level case.
3564          */
3565         if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
3566                 return ((ulong_t *)&ahp->array_chunk[an_idx]);
3567         } else {
3568
3569                 /*
3570                  * 2 level case.
3571                  */
3572                 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3573                 if (*ppp == NULL) {
3574                         mutex_enter(&ahp->serial_lock);
3575                         ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3576                         if (*ppp == NULL)
3577                                 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
3578                         mutex_exit(&ahp->serial_lock);
3579                 }
3580                 app = *ppp;
3581                 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
3582         }
3583 }
3584
3585 void
3586 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
3587 {
3588         ulong_t         *ap_slot;
3589         kmutex_t        *mtx;
3590         kcondvar_t      *cv;
3591         int             hash;
3592
3593         /*
3594          * Use szc to determine anon slot(s) to appear atomic.
3595          * If szc = 0, then lock the anon slot and mark it busy.
3596          * If szc > 0, then lock the range of slots by getting the
3597          * anon_array_lock for the first anon slot, and mark only the
3598          * first anon slot busy to represent whole range being busy.
3599          */
3600
3601         ASSERT(RW_READ_HELD(&amp->a_rwlock));
3602         an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3603         hash = ANON_ARRAY_HASH(amp, an_idx);
3604         sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3605         sobj->sync_cv = cv = &anon_array_cv[hash];
3606         mutex_enter(mtx);
3607         ap_slot = anon_get_slot(amp->ahp, an_idx);
3608         while (ANON_ISBUSY(ap_slot))
3609                 cv_wait(cv, mtx);
3610         ANON_SETBUSY(ap_slot);
3611         sobj->sync_data = ap_slot;
3612         mutex_exit(mtx);
3613 }
3614
3615 void
3616 anon_array_exit(anon_sync_obj_t *sobj)
3617 {
3618         mutex_enter(sobj->sync_mutex);
3619         ASSERT(ANON_ISBUSY(sobj->sync_data));
3620         ANON_CLRBUSY(sobj->sync_data);
3621         if (CV_HAS_WAITERS(sobj->sync_cv))
3622                 cv_broadcast(sobj->sync_cv);
3623         mutex_exit(sobj->sync_mutex);
3624 }