usr/src/uts/common/vm/vm_seg.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (c) 2015, Joyent, Inc.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39
  40 /*
  41  * VM - segment management.
  42  */
  43
  44 #include <sys/types.h>
  45 #include <sys/inttypes.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/kmem.h>
  50 #include <sys/sysmacros.h>
  51 #include <sys/vmsystm.h>
  52 #include <sys/tuneable.h>
  53 #include <sys/debug.h>
  54 #include <sys/fs/swapnode.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/callb.h>
  57 #include <sys/mem_config.h>
  58 #include <sys/mman.h>
  59
  60 #include <vm/hat.h>
  61 #include <vm/as.h>
  62 #include <vm/seg.h>
  63 #include <vm/seg_kmem.h>
  64 #include <vm/seg_spt.h>
  65 #include <vm/seg_vn.h>
  66 #include <vm/anon.h>
  67
  68 /*
  69  * kstats for segment advise
  70  */
  71 segadvstat_t segadvstat = {
  72         { "MADV_FREE_hit",      KSTAT_DATA_ULONG },
  73         { "MADV_FREE_miss",     KSTAT_DATA_ULONG },
  74 };
  75
  76 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
  77 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
  78
  79 /*
  80  * entry in the segment page cache
  81  */
  82 struct seg_pcache {
  83         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
  84         struct seg_pcache       *p_hprev;
  85         pcache_link_t           p_plink;        /* per segment/amp list */
  86         void                    *p_htag0;       /* segment/amp pointer */
  87         caddr_t                 p_addr;         /* base address/anon_idx */
  88         size_t                  p_len;          /* total bytes */
  89         size_t                  p_wlen;         /* writtable bytes at p_addr */
  90         struct page             **p_pp;         /* pp shadow list */
  91         seg_preclaim_cbfunc_t   p_callback;     /* reclaim callback function */
  92         clock_t                 p_lbolt;        /* lbolt from last use */
  93         struct seg_phash        *p_hashp;       /* our pcache hash bucket */
  94         uint_t                  p_active;       /* active count */
  95         uchar_t                 p_write;        /* true if S_WRITE */
  96         uchar_t                 p_ref;          /* reference byte */
  97         ushort_t                p_flags;        /* bit flags */
  98 };
  99
 100 struct seg_phash {
 101         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
 102         struct seg_pcache       *p_hprev;
 103         kmutex_t                p_hmutex;       /* protects hash bucket */
 104         pcache_link_t           p_halink[2];    /* active bucket linkages */
 105 };
 106
 107 struct seg_phash_wired {
 108         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
 109         struct seg_pcache       *p_hprev;
 110         kmutex_t                p_hmutex;       /* protects hash bucket */
 111 };
 112
 113 /*
 114  * A parameter to control a maximum number of bytes that can be
 115  * purged from pcache at a time.
 116  */
 117 #define P_MAX_APURGE_BYTES      (1024 * 1024 * 1024)
 118
 119 /*
 120  * log2(fraction of pcache to reclaim at a time).
 121  */
 122 #define P_SHRINK_SHFT           (5)
 123
 124 /*
 125  * The following variables can be tuned via /etc/system.
 126  */
 127
 128 int     segpcache_enabled = 1;          /* if 1, shadow lists are cached */
 129 pgcnt_t segpcache_maxwindow = 0;        /* max # of pages that can be cached */
 130 ulong_t segpcache_hashsize_win = 0;     /* # of non wired buckets */
 131 ulong_t segpcache_hashsize_wired = 0;   /* # of wired buckets */
 132 int     segpcache_reap_sec = 1;         /* reap check rate in secs */
 133 clock_t segpcache_reap_ticks = 0;       /* reap interval in ticks */
 134 int     segpcache_pcp_maxage_sec = 1;   /* pcp max age in secs */
 135 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
 136 int     segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
 137 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
 138
 139 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
 140 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
 141 static kcondvar_t seg_pasync_cv;
 142
 143 #pragma align 64(pctrl1)
 144 #pragma align 64(pctrl2)
 145 #pragma align 64(pctrl3)
 146
 147 /*
 148  * Keep frequently used variables together in one cache line.
 149  */
 150 static struct p_ctrl1 {
 151         uint_t p_disabled;              /* if not 0, caching temporarily off */
 152         pgcnt_t p_maxwin;               /* max # of pages that can be cached */
 153         size_t p_hashwin_sz;            /* # of non wired buckets */
 154         struct seg_phash *p_htabwin;    /* hash table for non wired entries */
 155         size_t p_hashwired_sz;          /* # of wired buckets */
 156         struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
 157         kmem_cache_t *p_kmcache;        /* kmem cache for seg_pcache structs */
 158 #ifdef _LP64
 159         ulong_t pad[1];
 160 #endif /* _LP64 */
 161 } pctrl1;
 162
 163 static struct p_ctrl2 {
 164         kmutex_t p_mem_mtx;     /* protects window counter and p_halinks */
 165         pgcnt_t  p_locked_win;  /* # pages from window */
 166         pgcnt_t  p_locked;      /* # of pages cached by pagelock */
 167         uchar_t  p_ahcur;       /* current active links for insert/delete */
 168         uchar_t  p_athr_on;     /* async reclaim thread is running. */
 169         pcache_link_t p_ahhead[2]; /* active buckets linkages */
 170 } pctrl2;
 171
 172 static struct p_ctrl3 {
 173         clock_t p_pcp_maxage;           /* max pcp age in ticks */
 174         ulong_t p_athr_empty_ahb;       /* athread walk stats */
 175         ulong_t p_athr_full_ahb;        /* athread walk stats */
 176         pgcnt_t p_maxapurge_npages;     /* max pages to purge at a time */
 177         int     p_shrink_shft;          /* reap shift factor */
 178 #ifdef _LP64
 179         ulong_t pad[3];
 180 #endif /* _LP64 */
 181 } pctrl3;
 182
 183 #define seg_pdisabled                   pctrl1.p_disabled
 184 #define seg_pmaxwindow                  pctrl1.p_maxwin
 185 #define seg_phashsize_win               pctrl1.p_hashwin_sz
 186 #define seg_phashtab_win                pctrl1.p_htabwin
 187 #define seg_phashsize_wired             pctrl1.p_hashwired_sz
 188 #define seg_phashtab_wired              pctrl1.p_htabwired
 189 #define seg_pkmcache                    pctrl1.p_kmcache
 190 #define seg_pmem_mtx                    pctrl2.p_mem_mtx
 191 #define seg_plocked_window              pctrl2.p_locked_win
 192 #define seg_plocked                     pctrl2.p_locked
 193 #define seg_pahcur                      pctrl2.p_ahcur
 194 #define seg_pathr_on                    pctrl2.p_athr_on
 195 #define seg_pahhead                     pctrl2.p_ahhead
 196 #define seg_pmax_pcpage                 pctrl3.p_pcp_maxage
 197 #define seg_pathr_empty_ahb             pctrl3.p_athr_empty_ahb
 198 #define seg_pathr_full_ahb              pctrl3.p_athr_full_ahb
 199 #define seg_pshrink_shift               pctrl3.p_shrink_shft
 200 #define seg_pmaxapurge_npages           pctrl3.p_maxapurge_npages
 201
 202 #define P_HASHWIN_MASK                  (seg_phashsize_win - 1)
 203 #define P_HASHWIRED_MASK                (seg_phashsize_wired - 1)
 204 #define P_BASESHIFT                     (6)
 205
 206 kthread_t *seg_pasync_thr;
 207
 208 extern struct seg_ops segvn_ops;
 209 extern struct seg_ops segspt_shmops;
 210
 211 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
 212 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
 213
 214 #define LBOLT_DELTA(t)  ((ulong_t)(ddi_get_lbolt() - (t)))
 215
 216 #define PCP_AGE(pcp)    LBOLT_DELTA((pcp)->p_lbolt)
 217
 218 /*
 219  * htag0 argument can be a seg or amp pointer.
 220  */
 221 #define P_HASHBP(seg, htag0, addr, flags)                               \
 222         (IS_PFLAGS_WIRED((flags)) ?                                     \
 223             ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
 224             ((uintptr_t)(htag0) >> P_BASESHIFT)]) :                     \
 225             (&seg_phashtab_win[P_HASHWIN_MASK &                         \
 226             (((uintptr_t)(htag0) >> 3) ^                                \
 227             ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?              \
 228             (flags >> 16) : page_get_shift((seg)->s_szc))))]))
 229
 230 /*
 231  * htag0 argument can be a seg or amp pointer.
 232  */
 233 #define P_MATCH(pcp, htag0, addr, len)                                  \
 234         ((pcp)->p_htag0 == (htag0) &&                                   \
 235         (pcp)->p_addr == (addr) &&                                      \
 236         (pcp)->p_len >= (len))
 237
 238 #define P_MATCH_PP(pcp, htag0, addr, len, pp)                           \
 239         ((pcp)->p_pp == (pp) &&                                         \
 240         (pcp)->p_htag0 == (htag0) &&                                    \
 241         (pcp)->p_addr == (addr) &&                                      \
 242         (pcp)->p_len >= (len))
 243
 244 #define plink2pcache(pl)        ((struct seg_pcache *)((uintptr_t)(pl) - \
 245     offsetof(struct seg_pcache, p_plink)))
 246
 247 #define hlink2phash(hl, l)      ((struct seg_phash *)((uintptr_t)(hl) - \
 248     offsetof(struct seg_phash, p_halink[l])))
 249
 250 /*
 251  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
 252  * active hash bucket lists. We maintain active bucket lists to reduce the
 253  * overhead of finding active buckets during asynchronous purging since there
 254  * can be 10s of millions of buckets on a large system but only a small subset
 255  * of them in actual use.
 256  *
 257  * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
 258  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
 259  * buckets. The other list is used by asynchronous purge thread. This allows
 260  * the purge thread to walk its active list without holding seg_pmem_mtx for a
 261  * long time. When asynchronous thread is done with its list it switches to
 262  * current active list and makes the list it just finished processing as
 263  * current active list.
 264  *
 265  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
 266  * yet on any list.  seg_premove_abuck() may remove the bucket from either
 267  * list. If the bucket is on current list it will be always removed. Otherwise
 268  * the bucket is only removed if asynchronous purge thread is not currently
 269  * running or seg_premove_abuck() is called by asynchronous purge thread
 270  * itself. A given bucket can only be on one of active lists at a time. These
 271  * routines should be called with per bucket lock held.  The routines use
 272  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
 273  * the first entry is added to the bucket chain and seg_premove_abuck() must
 274  * be called after the last pcp entry is deleted from its chain. Per bucket
 275  * lock should be held by the callers.  This avoids a potential race condition
 276  * when seg_premove_abuck() removes a bucket after pcp entries are added to
 277  * its list after the caller checked that the bucket has no entries. (this
 278  * race would cause a loss of an active bucket from the active lists).
 279  *
 280  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
 281  * New entries are added to the end of the list since LRU is used as the
 282  * purging policy.
 283  */
 284 static void
 285 seg_padd_abuck(struct seg_phash *hp)
 286 {
 287         int lix;
 288
 289         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 290         ASSERT((struct seg_phash *)hp->p_hnext != hp);
 291         ASSERT((struct seg_phash *)hp->p_hprev != hp);
 292         ASSERT(hp->p_hnext == hp->p_hprev);
 293         ASSERT(!IS_PCP_WIRED(hp->p_hnext));
 294         ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
 295         ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
 296         ASSERT(hp >= seg_phashtab_win &&
 297             hp < &seg_phashtab_win[seg_phashsize_win]);
 298
 299         /*
 300          * This bucket can already be on one of active lists
 301          * since seg_premove_abuck() may have failed to remove it
 302          * before.
 303          */
 304         mutex_enter(&seg_pmem_mtx);
 305         lix = seg_pahcur;
 306         ASSERT(lix >= 0 && lix <= 1);
 307         if (hp->p_halink[lix].p_lnext != NULL) {
 308                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 309                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 310                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 311                 mutex_exit(&seg_pmem_mtx);
 312                 return;
 313         }
 314         ASSERT(hp->p_halink[lix].p_lprev == NULL);
 315
 316         /*
 317          * If this bucket is still on list !lix async thread can't yet remove
 318          * it since we hold here per bucket lock. In this case just return
 319          * since async thread will eventually find and process this bucket.
 320          */
 321         if (hp->p_halink[!lix].p_lnext != NULL) {
 322                 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
 323                 mutex_exit(&seg_pmem_mtx);
 324                 return;
 325         }
 326         ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 327         /*
 328          * This bucket is not on any active bucket list yet.
 329          * Add the bucket to the tail of current active list.
 330          */
 331         hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
 332         hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
 333         seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
 334         seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
 335         mutex_exit(&seg_pmem_mtx);
 336 }
 337
 338 static void
 339 seg_premove_abuck(struct seg_phash *hp, int athr)
 340 {
 341         int lix;
 342
 343         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 344         ASSERT((struct seg_phash *)hp->p_hnext == hp);
 345         ASSERT((struct seg_phash *)hp->p_hprev == hp);
 346         ASSERT(hp >= seg_phashtab_win &&
 347             hp < &seg_phashtab_win[seg_phashsize_win]);
 348
 349         if (athr) {
 350                 ASSERT(seg_pathr_on);
 351                 ASSERT(seg_pahcur <= 1);
 352                 /*
 353                  * We are called by asynchronous thread that found this bucket
 354                  * on not currently active (i.e. !seg_pahcur) list. Remove it
 355                  * from there.  Per bucket lock we are holding makes sure
 356                  * seg_pinsert() can't sneak in and add pcp entries to this
 357                  * bucket right before we remove the bucket from its list.
 358                  */
 359                 lix = !seg_pahcur;
 360                 ASSERT(hp->p_halink[lix].p_lnext != NULL);
 361                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 362                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 363                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 364                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 365                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 366                 hp->p_halink[lix].p_lnext = NULL;
 367                 hp->p_halink[lix].p_lprev = NULL;
 368                 return;
 369         }
 370
 371         mutex_enter(&seg_pmem_mtx);
 372         lix = seg_pahcur;
 373         ASSERT(lix >= 0 && lix <= 1);
 374
 375         /*
 376          * If the bucket is on currently active list just remove it from
 377          * there.
 378          */
 379         if (hp->p_halink[lix].p_lnext != NULL) {
 380                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
 381                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
 382                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
 383                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 384                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 385                 hp->p_halink[lix].p_lnext = NULL;
 386                 hp->p_halink[lix].p_lprev = NULL;
 387                 mutex_exit(&seg_pmem_mtx);
 388                 return;
 389         }
 390         ASSERT(hp->p_halink[lix].p_lprev == NULL);
 391
 392         /*
 393          * If asynchronous thread is not running we can remove the bucket from
 394          * not currently active list. The bucket must be on this list since we
 395          * already checked that it's not on the other list and the bucket from
 396          * which we just deleted the last pcp entry must be still on one of the
 397          * active bucket lists.
 398          */
 399         lix = !lix;
 400         ASSERT(hp->p_halink[lix].p_lnext != NULL);
 401         ASSERT(hp->p_halink[lix].p_lprev != NULL);
 402
 403         if (!seg_pathr_on) {
 404                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
 405                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
 406                 hp->p_halink[lix].p_lnext = NULL;
 407                 hp->p_halink[lix].p_lprev = NULL;
 408         }
 409         mutex_exit(&seg_pmem_mtx);
 410 }
 411
 412 /*
 413  * Check if bucket pointed by hp already has a pcp entry that matches request
 414  * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
 415  * Also delete matching entries that cover smaller address range but start
 416  * at the same address as addr argument. Return the list of deleted entries if
 417  * any. This is an internal helper function called from seg_pinsert() only
 418  * for non wired shadow lists. The caller already holds a per seg/amp list
 419  * lock.
 420  */
 421 static struct seg_pcache *
 422 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
 423     caddr_t addr, size_t len, int *found)
 424 {
 425         struct seg_pcache *pcp;
 426         struct seg_pcache *delcallb_list = NULL;
 427
 428         ASSERT(MUTEX_HELD(&hp->p_hmutex));
 429
 430         *found = 0;
 431         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 432             pcp = pcp->p_hnext) {
 433                 ASSERT(pcp->p_hashp == hp);
 434                 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
 435                         ASSERT(!IS_PCP_WIRED(pcp));
 436                         if (pcp->p_len < len) {
 437                                 pcache_link_t *plinkp;
 438                                 if (pcp->p_active) {
 439                                         continue;
 440                                 }
 441                                 plinkp = &pcp->p_plink;
 442                                 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
 443                                 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
 444                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 445                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 446                                 pcp->p_hprev = delcallb_list;
 447                                 delcallb_list = pcp;
 448                         } else {
 449                                 *found = 1;
 450                                 break;
 451                         }
 452                 }
 453         }
 454         return (delcallb_list);
 455 }
 456
 457 /*
 458  * lookup an address range in pagelock cache. Return shadow list and bump up
 459  * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
 460  * as a lookup tag.
 461  */
 462 struct page **
 463 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
 464     enum seg_rw rw, uint_t flags)
 465 {
 466         struct seg_pcache *pcp;
 467         struct seg_phash *hp;
 468         void *htag0;
 469
 470         ASSERT(seg != NULL);
 471         ASSERT(rw == S_READ || rw == S_WRITE);
 472
 473         /*
 474          * Skip pagelock cache, while DR is in progress or
 475          * seg_pcache is off.
 476          */
 477         if (seg_pdisabled) {
 478                 return (NULL);
 479         }
 480         ASSERT(seg_phashsize_win != 0);
 481
 482         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 483         hp = P_HASHBP(seg, htag0, addr, flags);
 484         mutex_enter(&hp->p_hmutex);
 485         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 486             pcp = pcp->p_hnext) {
 487                 ASSERT(pcp->p_hashp == hp);
 488                 if (P_MATCH(pcp, htag0, addr, len)) {
 489                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
 490                         /*
 491                          * If this request wants to write pages
 492                          * but write permissions starting from
 493                          * addr don't cover the entire length len
 494                          * return lookup failure back to the caller.
 495                          * It will check protections and fail this
 496                          * pagelock operation with EACCESS error.
 497                          */
 498                         if (rw == S_WRITE && pcp->p_wlen < len) {
 499                                 break;
 500                         }
 501                         if (pcp->p_active == UINT_MAX) {
 502                                 break;
 503                         }
 504                         pcp->p_active++;
 505                         if (rw == S_WRITE && !pcp->p_write) {
 506                                 pcp->p_write = 1;
 507                         }
 508                         mutex_exit(&hp->p_hmutex);
 509                         return (pcp->p_pp);
 510                 }
 511         }
 512         mutex_exit(&hp->p_hmutex);
 513         return (NULL);
 514 }
 515
 516 /*
 517  * mark address range inactive. If the cache is off or the address range is
 518  * not in the cache or another shadow list that covers bigger range is found
 519  * we call the segment driver to reclaim the pages. Otherwise just decrement
 520  * active count and set ref bit.  If amp is not NULL use amp as a lookup tag
 521  * otherwise use seg as a lookup tag.
 522  */
 523 void
 524 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
 525     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
 526     seg_preclaim_cbfunc_t callback)
 527 {
 528         struct seg_pcache *pcp;
 529         struct seg_phash *hp;
 530         kmutex_t *pmtx = NULL;
 531         pcache_link_t *pheadp;
 532         void *htag0;
 533         pgcnt_t npages = 0;
 534         int keep = 0;
 535
 536         ASSERT(seg != NULL);
 537         ASSERT(rw == S_READ || rw == S_WRITE);
 538
 539         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 540
 541         /*
 542          * Skip lookup if pcache is not configured.
 543          */
 544         if (seg_phashsize_win == 0) {
 545                 goto out;
 546         }
 547
 548         /*
 549          * Grab per seg/amp lock before hash lock if we are going to remove
 550          * inactive entry from pcache.
 551          */
 552         if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
 553                 if (amp == NULL) {
 554                         pheadp = &seg->s_phead;
 555                         pmtx = &seg->s_pmtx;
 556                 } else {
 557                         pheadp = &amp->a_phead;
 558                         pmtx = &amp->a_pmtx;
 559                 }
 560                 mutex_enter(pmtx);
 561         }
 562
 563         hp = P_HASHBP(seg, htag0, addr, flags);
 564         mutex_enter(&hp->p_hmutex);
 565 again:
 566         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 567             pcp = pcp->p_hnext) {
 568                 ASSERT(pcp->p_hashp == hp);
 569                 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
 570                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
 571                         ASSERT(pcp->p_active);
 572                         if (keep) {
 573                                 /*
 574                                  * Don't remove this pcp entry
 575                                  * if we didn't find duplicate
 576                                  * shadow lists on second search.
 577                                  * Somebody removed those duplicates
 578                                  * since we dropped hash lock after first
 579                                  * search.
 580                                  */
 581                                 ASSERT(pmtx != NULL);
 582                                 ASSERT(!IS_PFLAGS_WIRED(flags));
 583                                 mutex_exit(pmtx);
 584                                 pmtx = NULL;
 585                         }
 586                         pcp->p_active--;
 587                         if (pcp->p_active == 0 && (pmtx != NULL ||
 588                             (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
 589
 590                                 /*
 591                                  * This entry is no longer active.  Remove it
 592                                  * now either because pcaching is temporarily
 593                                  * disabled or there're other pcp entries that
 594                                  * can match this pagelock request (i.e. this
 595                                  * entry is a duplicate).
 596                                  */
 597
 598                                 ASSERT(callback == pcp->p_callback);
 599                                 if (pmtx != NULL) {
 600                                         pcache_link_t *plinkp = &pcp->p_plink;
 601                                         ASSERT(!IS_PCP_WIRED(pcp));
 602                                         ASSERT(pheadp->p_lnext != pheadp);
 603                                         ASSERT(pheadp->p_lprev != pheadp);
 604                                         plinkp->p_lprev->p_lnext =
 605                                             plinkp->p_lnext;
 606                                         plinkp->p_lnext->p_lprev =
 607                                             plinkp->p_lprev;
 608                                 }
 609                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 610                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 611                                 if (!IS_PCP_WIRED(pcp) &&
 612                                     hp->p_hnext == (struct seg_pcache *)hp) {
 613                                         /*
 614                                          * We removed the last entry from this
 615                                          * bucket.  Now remove the bucket from
 616                                          * its active list.
 617                                          */
 618                                         seg_premove_abuck(hp, 0);
 619                                 }
 620                                 mutex_exit(&hp->p_hmutex);
 621                                 if (pmtx != NULL) {
 622                                         mutex_exit(pmtx);
 623                                 }
 624                                 len = pcp->p_len;
 625                                 npages = btop(len);
 626                                 if (rw != S_WRITE && pcp->p_write) {
 627                                         rw = S_WRITE;
 628                                 }
 629                                 kmem_cache_free(seg_pkmcache, pcp);
 630                                 goto out;
 631                         } else {
 632                                 /*
 633                                  * We found a matching pcp entry but will not
 634                                  * free it right away even if it's no longer
 635                                  * active.
 636                                  */
 637                                 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
 638                                         /*
 639                                          * Set the reference bit and mark the
 640                                          * time of last access to this pcp
 641                                          * so that asynchronous thread doesn't
 642                                          * free it immediately since
 643                                          * it may be reactivated very soon.
 644                                          */
 645                                         pcp->p_lbolt = ddi_get_lbolt();
 646                                         pcp->p_ref = 1;
 647                                 }
 648                                 mutex_exit(&hp->p_hmutex);
 649                                 if (pmtx != NULL) {
 650                                         mutex_exit(pmtx);
 651                                 }
 652                                 return;
 653                         }
 654                 } else if (!IS_PFLAGS_WIRED(flags) &&
 655                     P_MATCH(pcp, htag0, addr, len)) {
 656                         /*
 657                          * This is a duplicate pcp entry.  This situation may
 658                          * happen if a bigger shadow list that covers our
 659                          * range was added while our entry was still active.
 660                          * Now we can free our pcp entry if it becomes
 661                          * inactive.
 662                          */
 663                         if (!pcp->p_active) {
 664                                 /*
 665                                  * Mark this entry as referenced just in case
 666                                  * we'll free our own pcp entry soon.
 667                                  */
 668                                 pcp->p_lbolt = ddi_get_lbolt();
 669                                 pcp->p_ref = 1;
 670                         }
 671                         if (pmtx != NULL) {
 672                                 /*
 673                                  * we are already holding pmtx and found a
 674                                  * duplicate.  Don't keep our own pcp entry.
 675                                  */
 676                                 keep = 0;
 677                                 continue;
 678                         }
 679                         /*
 680                          * We have to use mutex_tryenter to attempt to lock
 681                          * seg/amp list lock since we already hold hash lock
 682                          * and seg/amp list lock is above hash lock in lock
 683                          * order.  If mutex_tryenter fails drop hash lock and
 684                          * retake both locks in correct order and re-search
 685                          * this hash chain.
 686                          */
 687                         ASSERT(keep == 0);
 688                         if (amp == NULL) {
 689                                 pheadp = &seg->s_phead;
 690                                 pmtx = &seg->s_pmtx;
 691                         } else {
 692                                 pheadp = &amp->a_phead;
 693                                 pmtx = &amp->a_pmtx;
 694                         }
 695                         if (!mutex_tryenter(pmtx)) {
 696                                 mutex_exit(&hp->p_hmutex);
 697                                 mutex_enter(pmtx);
 698                                 mutex_enter(&hp->p_hmutex);
 699                                 /*
 700                                  * If we don't find bigger shadow list on
 701                                  * second search (it may happen since we
 702                                  * dropped bucket lock) keep the entry that
 703                                  * matches our own shadow list.
 704                                  */
 705                                 keep = 1;
 706                                 goto again;
 707                         }
 708                 }
 709         }
 710         mutex_exit(&hp->p_hmutex);
 711         if (pmtx != NULL) {
 712                 mutex_exit(pmtx);
 713         }
 714 out:
 715         (*callback)(htag0, addr, len, pp, rw, 0);
 716         if (npages) {
 717                 mutex_enter(&seg_pmem_mtx);
 718                 ASSERT(seg_plocked >= npages);
 719                 seg_plocked -= npages;
 720                 if (!IS_PFLAGS_WIRED(flags)) {
 721                         ASSERT(seg_plocked_window >= npages);
 722                         seg_plocked_window -= npages;
 723                 }
 724                 mutex_exit(&seg_pmem_mtx);
 725         }
 726
 727 }
 728
 729 #ifdef DEBUG
     /*
      * DEBUG-only fault injection knob: when set to a nonzero value,
      * seg_pinsert_check() returns SEGP_FAIL whenever gethrtime() happens to
      * be an exact multiple of it.  Zero (the default) disables the injection.
      */
 730 static uint32_t p_insert_chk_mtbf = 0;
 731 #endif
 732
 733 /*
 734  * The seg_pinsert_check() is used by segment drivers to predict whether
 735  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
      *
      * Only len and flags influence the result; seg is merely asserted to be
      * non-NULL and amp/addr are not examined (hence ARGSUSED).
      *
      * Returns SEGP_FAIL if the cache is disabled (seg_pdisabled), if a non
      * wired request of btop(len) pages would push seg_plocked_window past
      * seg_pmaxwindow, or if freemem is below desfree.  On DEBUG kernels it may
      * also return SEGP_FAIL because of the p_insert_chk_mtbf fault injection.
      * Wired requests (IS_PFLAGS_WIRED(flags)) return SEGP_SUCCESS as soon as
      * the cache is known to be enabled.  Otherwise returns SEGP_SUCCESS.
      *
      * seg_plocked_window is read here without taking seg_pmem_mtx and nothing
      * is reserved, so the answer is only a prediction.
 736  */
 737 /*ARGSUSED*/
 738 int
 739 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
 740     size_t len, uint_t flags)
 741 {
 742         ASSERT(seg != NULL);
 743
 744 #ifdef DEBUG
             /* Randomized failure injection, see p_insert_chk_mtbf. */
 745         if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
 746                 return (SEGP_FAIL);
 747         }
 748 #endif
 749
 750         if (seg_pdisabled) {
 751                 return (SEGP_FAIL);
 752         }
 753         ASSERT(seg_phashsize_win != 0);
 754
             /* Wired requests are not subject to the window or memory checks. */
 755         if (IS_PFLAGS_WIRED(flags)) {
 756                 return (SEGP_SUCCESS);
 757         }
 758
             /* Would this request exceed the allowed non wired window? */
 759         if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
 760                 return (SEGP_FAIL);
 761         }
 762
             /* Refuse when free memory is already below desfree. */
 763         if (freemem < desfree) {
 764                 return (SEGP_FAIL);
 765         }
 766
 767         return (SEGP_SUCCESS);
 768 }
 769
 770 #ifdef DEBUG
     /*
      * DEBUG-only fault injection knob: when nonzero, seg_pinsert() returns
      * SEGP_FAIL whenever gethrtime() is an exact multiple of it.
      */
 771 static uint32_t p_insert_mtbf = 0;
 772 #endif
 773
 774 /*
 775  * Insert address range with shadow list into pagelock cache if there's no
 776  * shadow list already cached for this address range. If the cache is off or
 777  * caching is temporarily disabled or the allowed 'window' is exceeded return
 778  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 779  *
 780  * For non wired shadow lists (segvn case) include address in the hashing
 781  * function to avoid linking all the entries from the same segment or amp on
 782  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
 783  * pcache entries are also linked on a per segment/amp list so that all
 784  * entries can be found quickly during seg/amp purge without walking the
 785  * entire pcache hash table.  For wired shadow lists (segspt case) we
 786  * don't use address hashing and per segment linking because the caller
 787  * currently inserts only one entry per segment that covers the entire
 788  * segment. If we used per segment linking even for segspt it would complicate
 789  * seg_ppurge_wiredpp() locking.
 790  *
 791  * Both hash bucket and per seg/amp locks need to be held before adding a non
 792  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 793  * first.
 794  *
 795  * This function will also remove from pcache old inactive shadow lists that
 796  * overlap with this request but cover smaller range for the same start
 797  * address.
      *
      * Arguments, as used by the code below:
      *   seg      - must not be NULL; used as the tag when amp is NULL.
      *   amp      - if not NULL, used as the tag instead of seg and the entry
      *              is marked SEGP_AMP.
      *   addr/len - range being cached; len must be a multiple of the page
      *              size (asserted).
      *   wlen     - asserted equal to len for S_WRITE and whenever amp is not
      *              NULL, and at most len for S_READ; stored in p_wlen.
      *   pp       - shadow list pointer stored in the entry.
      *   rw       - S_READ or S_WRITE (asserted).
      *   flags    - the low 16 bits are stored in p_flags;
      *              IS_PFLAGS_WIRED(flags) selects the wired path.
      *   callback - reclaim callback stored in the entry.
      *
      * Note that SEGP_SUCCESS is also returned when seg_plookup_checkdup() sets
      * 'found' (presumably because an already cached shadow list covers this
      * request); in that case the new entry is not inserted and the accounting
      * done for it is undone.
 798  */
 799 int
 800 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
 801     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
 802     seg_preclaim_cbfunc_t callback)
 803 {
 804         struct seg_pcache *pcp;
 805         struct seg_phash *hp;
 806         pgcnt_t npages;
             /* pheadp and pmtx are only set and used on the non wired path. */
 807         pcache_link_t *pheadp;
 808         kmutex_t *pmtx;
             /* Entries handed back by seg_plookup_checkdup(), chained by p_hprev. */
 809         struct seg_pcache *delcallb_list = NULL;
 810
 811         ASSERT(seg != NULL);
 812         ASSERT(rw == S_READ || rw == S_WRITE);
 813         ASSERT(rw == S_READ || wlen == len);
 814         ASSERT(rw == S_WRITE || wlen <= len);
 815         ASSERT(amp == NULL || wlen == len);
 816
 817 #ifdef DEBUG
             /* Randomized failure injection, see p_insert_mtbf. */
 818         if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
 819                 return (SEGP_FAIL);
 820         }
 821 #endif
 822
 823         if (seg_pdisabled) {
 824                 return (SEGP_FAIL);
 825         }
 826         ASSERT(seg_phashsize_win != 0);
 827
 828         ASSERT((len & PAGEOFFSET) == 0);
 829         npages = btop(len);
             /*
              * Charge the pages to the global counters up front.  Non wired
              * entries are additionally charged to, and limited by, the window.
              */
 830         mutex_enter(&seg_pmem_mtx);
 831         if (!IS_PFLAGS_WIRED(flags)) {
 832                 if (seg_plocked_window + npages > seg_pmaxwindow) {
 833                         mutex_exit(&seg_pmem_mtx);
 834                         return (SEGP_FAIL);
 835                 }
 836                 seg_plocked_window += npages;
 837         }
 838         seg_plocked += npages;
 839         mutex_exit(&seg_pmem_mtx);
 840
 841         pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
 842         /*
 843          * If amp is not NULL set htag0 to amp otherwise set it to seg.
 844          */
 845         if (amp == NULL) {
 846                 pcp->p_htag0 = (void *)seg;
 847                 pcp->p_flags = flags & 0xffff;
 848         } else {
 849                 pcp->p_htag0 = (void *)amp;
 850                 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
 851         }
 852         pcp->p_addr = addr;
 853         pcp->p_len = len;
 854         pcp->p_wlen = wlen;
 855         pcp->p_pp = pp;
 856         pcp->p_write = (rw == S_WRITE);
 857         pcp->p_callback = callback;
             /* The new entry starts out active. */
 858         pcp->p_active = 1;
 859
 860         hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
 861         if (!IS_PFLAGS_WIRED(flags)) {
 862                 int found;
 863                 void *htag0;
 864                 if (amp == NULL) {
 865                         pheadp = &seg->s_phead;
 866                         pmtx = &seg->s_pmtx;
 867                         htag0 = (void *)seg;
 868                 } else {
 869                         pheadp = &amp->a_phead;
 870                         pmtx = &amp->a_pmtx;
 871                         htag0 = (void *)amp;
 872                 }
                     /* Lock order: per seg/amp list lock first, then the bucket. */
 873                 mutex_enter(pmtx);
 874                 mutex_enter(&hp->p_hmutex);
 875                 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
 876                     len, &found);
 877                 if (found) {
                             /*
                              * Don't insert our entry: drop the locks, give
                              * back the pages charged above and free it.  Any
                              * entries on delcallb_list are still processed
                              * at out.
                              */
 878                         mutex_exit(&hp->p_hmutex);
 879                         mutex_exit(pmtx);
 880                         mutex_enter(&seg_pmem_mtx);
 881                         seg_plocked -= npages;
 882                         seg_plocked_window -= npages;
 883                         mutex_exit(&seg_pmem_mtx);
 884                         kmem_cache_free(seg_pkmcache, pcp);
 885                         goto out;
 886                 }
                     /* Link the entry at the head of the per seg/amp list. */
 887                 pcp->p_plink.p_lnext = pheadp->p_lnext;
 888                 pcp->p_plink.p_lprev = pheadp;
 889                 pheadp->p_lnext->p_lprev = &pcp->p_plink;
 890                 pheadp->p_lnext = &pcp->p_plink;
 891         } else {
 892                 mutex_enter(&hp->p_hmutex);
 893         }
             /* Link the entry at the head of the hash chain. */
 894         pcp->p_hashp = hp;
 895         pcp->p_hnext = hp->p_hnext;
 896         pcp->p_hprev = (struct seg_pcache *)hp;
 897         hp->p_hnext->p_hprev = pcp;
 898         hp->p_hnext = pcp;
             /*
              * pcp was inserted at the head; if it is also the tail it is the
              * only entry, i.e. this non wired bucket just became non empty.
              * seg_padd_abuck() presumably puts the bucket on the active bucket
              * list walked by seg_ppurge_async().
              */
 899         if (!IS_PFLAGS_WIRED(flags) &&
 900             hp->p_hprev == pcp) {
 901                 seg_padd_abuck(hp);
 902         }
 903         mutex_exit(&hp->p_hmutex);
 904         if (!IS_PFLAGS_WIRED(flags)) {
 905                 mutex_exit(pmtx);
 906         }
 907
 908 out:
             /*
              * With all locks dropped run the reclaim callback for every entry
              * on delcallb_list and free it.  npages is reused to count the
              * pages released so the global counters can be adjusted once.
              */
 909         npages = 0;
 910         while (delcallb_list != NULL) {
 911                 pcp = delcallb_list;
 912                 delcallb_list = pcp->p_hprev;
 913                 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
 914                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
 915                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
 916                 npages += btop(pcp->p_len);
 917                 kmem_cache_free(seg_pkmcache, pcp);
 918         }
 919         if (npages) {
 920                 ASSERT(!IS_PFLAGS_WIRED(flags));
 921                 mutex_enter(&seg_pmem_mtx);
 922                 ASSERT(seg_plocked >= npages);
 923                 ASSERT(seg_plocked_window >= npages);
 924                 seg_plocked -= npages;
 925                 seg_plocked_window -= npages;
 926                 mutex_exit(&seg_pmem_mtx);
 927         }
 928
 929         return (SEGP_SUCCESS);
 930 }
 931
 932 /*
 933  * purge entries from the pagelock cache if not active
 934  * and not recently used.
      *
      * force == 0: purge only under memory pressure (lowmem) or when the non
      * wired window is nearly full (trim).  Only non wired entries are
      * examined, entries that were referenced and are younger than
      * seg_pmax_pcpage get one more chance (their p_ref is cleared), and the
      * walk stops once roughly npgs_to_purge pages have been collected.
      *
      * force != 0: the heuristics are skipped, every inactive entry in the
      * wired hash table is collected, and inactive non wired entries are
      * collected regardless of p_ref and age.
      *
      * Only one thread at a time walks the active bucket lists (seg_pathr_on).
      * Collected entries are unlinked under the appropriate locks and chained
      * on a local list through p_hprev; their callbacks are run and the
      * entries freed after all locks have been dropped.
 935  */
 936 static void
 937 seg_ppurge_async(int force)
 938 {
 939         struct seg_pcache *delcallb_list = NULL;
 940         struct seg_pcache *pcp;
 941         struct seg_phash *hp;
             /* Pages released in total and the non wired part of them. */
 942         pgcnt_t npages = 0;
 943         pgcnt_t npages_window = 0;
             /* Purge target; set and used only when !force. */
 944         pgcnt_t npgs_to_purge;
 945         pgcnt_t npgs_purged = 0;
             /* Number of active bucket lists walked to completion so far. */
 946         int hlinks = 0;
             /* Index (0 or 1) of the active bucket list being walked. */
 947         int hlix;
 948         pcache_link_t *hlinkp;
 949         pcache_link_t *hlnextp = NULL;
             /* lowmem and trim are set and used only when !force. */
 950         int lowmem;
 951         int trim;
 952
 953         ASSERT(seg_phashsize_win != 0);
 954
 955         /*
 956          * if the cache is off or empty, return
 957          */
 958         if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
 959                 return;
 960         }
 961
 962         if (!force) {
 963                 lowmem = 0;
 964                 trim = 0;
                     /*
                      * Decide whether memory is low.  fmem is free memory less
                      * needfree, clamped at 0.  The more free memory there
                      * is, the larger the non wired window has to be
                      * (relative to availrmem_initial) before lowmem is set:
                      *   fmem <= 5/4 of desfree:   always
                      *   fmem <= 7/8 of lotsfree:  window >= 1/2 of it
                      *   fmem <  lotsfree:         window >= 3/4 of it
                      */
 965                 if (freemem < lotsfree + needfree) {
 966                         spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
 967                         if (fmem <= 5 * (desfree >> 2)) {
 968                                 lowmem = 1;
 969                         } else if (fmem <= 7 * (lotsfree >> 3)) {
 970                                 if (seg_plocked_window >=
 971                                     (availrmem_initial >> 1)) {
 972                                         lowmem = 1;
 973                                 }
 974                         } else if (fmem < lotsfree) {
 975                                 if (seg_plocked_window >=
 976                                     3 * (availrmem_initial >> 2)) {
 977                                         lowmem = 1;
 978                                 }
 979                         }
 980                 }
                     /* Trim when the window is at least 7/8 of its maximum. */
 981                 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
 982                         trim = 1;
 983                 }
 984                 if (!lowmem && !trim) {
 985                         return;
 986                 }
                     /*
                      * Aim for a seg_pshrink_shift fraction of the window,
                      * capped at seg_pmaxapurge_npages (or at desfree if that
                      * is larger and memory is low).
                      */
 987                 npgs_to_purge = seg_plocked_window >>
 988                     seg_pshrink_shift;
 989                 if (lowmem) {
 990                         npgs_to_purge = MIN(npgs_to_purge,
 991                             MAX(seg_pmaxapurge_npages, desfree));
 992                 } else {
 993                         npgs_to_purge = MIN(npgs_to_purge,
 994                             seg_pmaxapurge_npages);
 995                 }
 996                 if (npgs_to_purge == 0) {
 997                         return;
 998                 }
 999         } else {
                     /*
                      * Forced purge: collect every inactive entry from the
                      * wired hash table.
                      */
 1000                 struct seg_phash_wired *hpw;
 1001
 1002                 ASSERT(seg_phashsize_wired != 0);
 1003
 1004                 for (hpw = seg_phashtab_wired;
 1005                     hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
 1006
                              /* Unlocked peek: skip buckets that look empty. */
 1007                         if (hpw->p_hnext == (struct seg_pcache *)hpw) {
 1008                                 continue;
 1009                         }
 1010
 1011                         mutex_enter(&hpw->p_hmutex);
 1012
 1013                         for (pcp = hpw->p_hnext;
 1014                             pcp != (struct seg_pcache *)hpw;
 1015                             pcp = pcp->p_hnext) {
 1016
 1017                                 ASSERT(IS_PCP_WIRED(pcp));
 1018                                 ASSERT(pcp->p_hashp ==
 1019                                     (struct seg_phash *)hpw);
 1020
 1021                                 if (pcp->p_active) {
 1022                                         continue;
 1023                                 }
                                      /*
                                       * Unlink from the hash chain and push
                                       * on delcallb_list.  Only p_hprev is
                                       * overwritten, so p_hnext still leads
                                       * to the next entry for the loop.
                                       */
 1024                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 1025                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 1026                                 pcp->p_hprev = delcallb_list;
 1027                                 delcallb_list = pcp;
 1028                         }
 1029                         mutex_exit(&hpw->p_hmutex);
 1030                 }
 1031         }
 1032
              /*
               * Allow only one walker of the active bucket lists at a time.  If
               * another thread is already walking, just run the callbacks for
               * whatever was collected above.
               */
 1033         mutex_enter(&seg_pmem_mtx);
 1034         if (seg_pathr_on) {
 1035                 mutex_exit(&seg_pmem_mtx);
 1036                 goto runcb;
 1037         }
 1038         seg_pathr_on = 1;
 1039         mutex_exit(&seg_pmem_mtx);
 1040         ASSERT(seg_pahcur <= 1);
              /* Start with the list that is not the current one (seg_pahcur). */
 1041         hlix = !seg_pahcur;
 1042
 1043 again:
 1044         for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
 1045             hlinkp = hlnextp) {
 1046
                      /*
                       * Remember the successor now; the bucket may be taken off
                       * this list by seg_premove_abuck() below.
                       */
 1047                 hlnextp = hlinkp->p_lnext;
 1048                 ASSERT(hlnextp != NULL);
 1049
 1050                 hp = hlink2phash(hlinkp, hlix);
                      /* Unlocked peek: skip buckets that look empty. */
 1051                 if (hp->p_hnext == (struct seg_pcache *)hp) {
 1052                         seg_pathr_empty_ahb++;
 1053                         continue;
 1054                 }
 1055                 seg_pathr_full_ahb++;
 1056                 mutex_enter(&hp->p_hmutex);
 1057
 1058                 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 1059                     pcp = pcp->p_hnext) {
 1060                         pcache_link_t *pheadp;
 1061                         pcache_link_t *plinkp;
 1062                         void *htag0;
 1063                         kmutex_t *pmtx;
 1064
 1065                         ASSERT(!IS_PCP_WIRED(pcp));
 1066                         ASSERT(pcp->p_hashp == hp);
 1067
 1068                         if (pcp->p_active) {
 1069                                 continue;
 1070                         }
                              /*
                               * Unless forced, a referenced entry that is still
                               * young only loses its reference mark this time.
                               */
 1071                         if (!force && pcp->p_ref &&
 1072                             PCP_AGE(pcp) < seg_pmax_pcpage) {
 1073                                 pcp->p_ref = 0;
 1074                                 continue;
 1075                         }
 1076                         plinkp = &pcp->p_plink;
 1077                         htag0 = pcp->p_htag0;
                              /* SEGP_AMP tells whether htag0 is an amp or a seg. */
 1078                         if (pcp->p_flags & SEGP_AMP) {
 1079                                 pheadp = &((amp_t *)htag0)->a_phead;
 1080                                 pmtx = &((amp_t *)htag0)->a_pmtx;
 1081                         } else {
 1082                                 pheadp = &((seg_t *)htag0)->s_phead;
 1083                                 pmtx = &((seg_t *)htag0)->s_pmtx;
 1084                         }
                              /*
                               * The bucket lock is already held and the per
                               * seg/amp lock is normally taken before it, so
                               * only try the lock and skip the entry if that
                               * fails.
                               */
 1085                         if (!mutex_tryenter(pmtx)) {
 1086                                 continue;
 1087                         }
 1088                         ASSERT(pheadp->p_lnext != pheadp);
 1089                         ASSERT(pheadp->p_lprev != pheadp);
                              /*
                               * Unlink from the per seg/amp list and from the
                               * hash chain, then push on delcallb_list.
                               * p_hnext is left intact for the loop iterator.
                               */
 1090                         plinkp->p_lprev->p_lnext =
 1091                             plinkp->p_lnext;
 1092                         plinkp->p_lnext->p_lprev =
 1093                             plinkp->p_lprev;
 1094                         pcp->p_hprev->p_hnext = pcp->p_hnext;
 1095                         pcp->p_hnext->p_hprev = pcp->p_hprev;
 1096                         mutex_exit(pmtx);
 1097                         pcp->p_hprev = delcallb_list;
 1098                         delcallb_list = pcp;
 1099                         npgs_purged += btop(pcp->p_len);
 1100                 }
                      /* The bucket became empty. */
 1101                 if (hp->p_hnext == (struct seg_pcache *)hp) {
 1102                         seg_premove_abuck(hp, 1);
 1103                 }
 1104                 mutex_exit(&hp->p_hmutex);
                      /* Stop once the whole window's worth has been collected. */
 1105                 if (npgs_purged >= seg_plocked_window) {
 1106                         break;
 1107                 }
 1108                 if (!force) {
 1109                         if (npgs_purged >= npgs_to_purge) {
 1110                                 break;
 1111                         }
                              /*
                               * When purging only because of low memory,
                               * recheck free memory each time the non empty
                               * bucket counter reaches a multiple of 16 and
                               * stop if the shortage is over.
                               */
 1112                         if (!trim && !(seg_pathr_full_ahb & 15)) {
 1113                                 ASSERT(lowmem);
 1114                                 if (freemem >= lotsfree + needfree) {
 1115                                         break;
 1116                                 }
 1117                         }
 1118                 }
 1119         }
 1120
 1121         if (hlinkp == &seg_pahhead[hlix]) {
 1122                 /*
 1123                  * We processed the entire hlix active bucket list
 1124                  * but didn't find enough pages to reclaim.
 1125                  * Switch the lists and walk the other list
 1126                  * if we haven't done it yet.
 1127                  */
 1128                 mutex_enter(&seg_pmem_mtx);
 1129                 ASSERT(seg_pathr_on);
 1130                 ASSERT(seg_pahcur == !hlix);
 1131                 seg_pahcur = hlix;
 1132                 mutex_exit(&seg_pmem_mtx);
 1133                 if (++hlinks < 2) {
 1134                         hlix = !hlix;
 1135                         goto again;
 1136                 }
 1137         } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
 1138             seg_pahhead[hlix].p_lnext != hlinkp) {
                      /*
                       * The walk stopped early; hlinkp is the first bucket not
                       * yet examined and it is not already first on the list.
                       */
 1139                 ASSERT(hlinkp != NULL);
 1140                 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
 1141                 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
 1142                 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
 1143
 1144                 /*
 1145                  * Reinsert the header to point to hlinkp
 1146                  * so that we start from hlinkp bucket next time around.
 1147                  */
 1148                 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
 1149                 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
 1150                 seg_pahhead[hlix].p_lnext = hlinkp;
 1151                 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
 1152                 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
 1153                 hlinkp->p_lprev = &seg_pahhead[hlix];
 1154         }
 1155
              /* Done walking; let another thread walk the lists. */
 1156         mutex_enter(&seg_pmem_mtx);
 1157         ASSERT(seg_pathr_on);
 1158         seg_pathr_on = 0;
 1159         mutex_exit(&seg_pmem_mtx);
 1160
 1161 runcb:
 1162         /*
 1163          * Run the delayed callback list. segments/amps can't go away until
 1164          * callback is executed since they must have non 0 softlockcnt. That's
 1165          * why we don't need to hold as/seg/amp locks to execute the callback.
               *
               * The callback is invoked with 1 as its last argument here, whereas
               * seg_pinsert() and seg_ppurge() pass 0.
 1166          */
 1167         while (delcallb_list != NULL) {
 1168                 pcp = delcallb_list;
 1169                 delcallb_list = pcp->p_hprev;
 1170                 ASSERT(!pcp->p_active);
 1171                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
 1172                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
 1173                 npages += btop(pcp->p_len);
                      /* Only non wired entries were charged to the window. */
 1174                 if (!IS_PCP_WIRED(pcp)) {
 1175                         npages_window += btop(pcp->p_len);
 1176                 }
 1177                 kmem_cache_free(seg_pkmcache, pcp);
 1178         }
 1179         if (npages) {
 1180                 mutex_enter(&seg_pmem_mtx);
 1181                 ASSERT(seg_plocked >= npages);
 1182                 ASSERT(seg_plocked_window >= npages_window);
 1183                 seg_plocked -= npages;
 1184                 seg_plocked_window -= npages_window;
 1185                 mutex_exit(&seg_pmem_mtx);
 1186         }
 1187 }
1188
 1189 /*
 1190  * Remove cached pages for segment(s) entries from hashtable.  The segments
 1191  * are identified by pp array. This is useful for multiple seg's cached on
 1192  * behalf of dummy segment (ISM/DISM) with common pp array.
       *
       * Every bucket of the wired hash table is scanned.  Inactive entries whose
       * p_pp equals pp are unlinked under the bucket lock and, once that lock is
       * dropped, their callback is run and the entry is freed.  If a callback
       * returns non zero the scan stops early.  seg_plocked is reduced by the
       * number of pages released.
 1193  */
 1194 void
 1195 seg_ppurge_wiredpp(struct page **pp)
 1196 {
 1197         struct seg_pcache *pcp;
 1198         struct seg_phash_wired *hp;
 1199         pgcnt_t npages = 0;
              /* Unlinked entries awaiting their callback, chained by p_hprev. */
 1200         struct  seg_pcache *delcallb_list = NULL;
 1201
 1202         /*
 1203          * if the cache is empty, return
 1204          */
 1205         if (seg_plocked == 0) {
 1206                 return;
 1207         }
 1208         ASSERT(seg_phashsize_wired != 0);
 1209
 1210         for (hp = seg_phashtab_wired;
 1211             hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
                      /* Unlocked peek: skip buckets that look empty. */
 1212                 if (hp->p_hnext == (struct seg_pcache *)hp) {
 1213                         continue;
 1214                 }
 1215                 mutex_enter(&hp->p_hmutex);
 1216                 pcp = hp->p_hnext;
 1217                 while (pcp != (struct seg_pcache *)hp) {
 1218                         ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
 1219                         ASSERT(IS_PCP_WIRED(pcp));
 1220                         /*
 1221                          * purge entries which are not active
 1222                          */
 1223                         if (!pcp->p_active && pcp->p_pp == pp) {
 1224                                 ASSERT(pcp->p_htag0 != NULL);
                                      /*
                                       * Unlink and push on delcallb_list;
                                       * p_hnext stays valid for the advance
                                       * below.
                                       */
 1225                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 1226                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 1227                                 pcp->p_hprev = delcallb_list;
 1228                                 delcallb_list = pcp;
 1229                         }
 1230                         pcp = pcp->p_hnext;
 1231                 }
 1232                 mutex_exit(&hp->p_hmutex);
 1233                 /*
 1234                  * segments can't go away until callback is executed since
 1235                  * they must have non 0 softlockcnt. That's why we don't
 1236                  * need to hold as/seg locks to execute the callback.
 1237                  */
 1238                 while (delcallb_list != NULL) {
 1239                         int done;
 1240                         pcp = delcallb_list;
 1241                         delcallb_list = pcp->p_hprev;
 1242                         ASSERT(!pcp->p_active);
 1243                         done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
 1244                             pcp->p_len, pcp->p_pp,
 1245                             pcp->p_write ? S_WRITE : S_READ, 1);
 1246                         npages += btop(pcp->p_len);
 1247                         ASSERT(IS_PCP_WIRED(pcp));
 1248                         kmem_cache_free(seg_pkmcache, pcp);
                              /*
                               * A non zero return ends the whole scan; the list
                               * is expected to be empty at that point.
                               */
 1249                         if (done) {
 1250                                 ASSERT(delcallb_list == NULL);
 1251                                 goto out;
 1252                         }
 1253                 }
 1254         }
 1255
 1256 out:
              /* Wired entries are charged to seg_plocked only, not the window. */
 1257         mutex_enter(&seg_pmem_mtx);
 1258         ASSERT(seg_plocked >= npages);
 1259         seg_plocked -= npages;
 1260         mutex_exit(&seg_pmem_mtx);
 1261 }
1262
 1263 /*
 1264  * purge all entries for a given segment. Since we
 1265  * callback into the segment driver directly for page
 1266  * reclaim the caller needs to hold the right locks.
       *
       * amp, if not NULL, is used as the lookup tag instead of seg.  For wired
       * flags the single hash bucket selected by P_HASHBP() is searched for
       * entries with that tag; otherwise the per seg/amp list is drained from
       * its head.  In both cases the search stops at the first matching entry
       * that is still active, so entries after it are left in the cache.
       * Callbacks for the removed entries are run after the locks are dropped,
       * and seg_plocked (plus seg_plocked_window for non wired flags) is
       * reduced accordingly.
 1267  */
 1268 void
 1269 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
 1270 {
              /* Unlinked entries awaiting their callback, chained by p_hprev. */
 1271         struct seg_pcache *delcallb_list = NULL;
 1272         struct seg_pcache *pcp;
 1273         struct seg_phash *hp;
 1274         pgcnt_t npages = 0;
 1275         void *htag0;
 1276
              /* Nothing is cached at all. */
 1277         if (seg_plocked == 0) {
 1278                 return;
 1279         }
 1280         ASSERT(seg_phashsize_win != 0);
 1281
 1282         /*
 1283          * If amp is not NULL use amp as a lookup tag otherwise use seg
 1284          * as a lookup tag.
 1285          */
 1286         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 1287         ASSERT(htag0 != NULL);
 1288         if (IS_PFLAGS_WIRED(flags)) {
                      /* Wired entries: the bucket is looked up with address 0. */
 1289                 hp = P_HASHBP(seg, htag0, 0, flags);
 1290                 mutex_enter(&hp->p_hmutex);
 1291                 pcp = hp->p_hnext;
 1292                 while (pcp != (struct seg_pcache *)hp) {
 1293                         ASSERT(pcp->p_hashp == hp);
 1294                         ASSERT(IS_PCP_WIRED(pcp));
 1295                         if (pcp->p_htag0 == htag0) {
                                      /* Stop at the first active match. */
 1296                                 if (pcp->p_active) {
 1297                                         break;
 1298                                 }
                                      /*
                                       * Unlink and push on delcallb_list;
                                       * p_hnext stays valid for the advance
                                       * below.
                                       */
 1299                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 1300                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 1301                                 pcp->p_hprev = delcallb_list;
 1302                                 delcallb_list = pcp;
 1303                         }
 1304                         pcp = pcp->p_hnext;
 1305                 }
 1306                 mutex_exit(&hp->p_hmutex);
 1307         } else {
 1308                 pcache_link_t *plinkp;
 1309                 pcache_link_t *pheadp;
 1310                 kmutex_t *pmtx;
 1311
 1312                 if (amp == NULL) {
 1313                         ASSERT(seg != NULL);
 1314                         pheadp = &seg->s_phead;
 1315                         pmtx = &seg->s_pmtx;
 1316                 } else {
 1317                         pheadp = &amp->a_phead;
 1318                         pmtx = &amp->a_pmtx;
 1319                 }
                      /*
                       * Lock order: per seg/amp list lock first, then each
                       * entry's hash bucket lock.  Always take the entry at the
                       * head of the list until the list is empty or the head
                       * entry is active.
                       */
 1320                 mutex_enter(pmtx);
 1321                 while ((plinkp = pheadp->p_lnext) != pheadp) {
 1322                         pcp = plink2pcache(plinkp);
 1323                         ASSERT(!IS_PCP_WIRED(pcp));
 1324                         ASSERT(pcp->p_htag0 == htag0);
 1325                         hp = pcp->p_hashp;
 1326                         mutex_enter(&hp->p_hmutex);
 1327                         if (pcp->p_active) {
 1328                                 mutex_exit(&hp->p_hmutex);
 1329                                 break;
 1330                         }
 1331                         ASSERT(plinkp->p_lprev == pheadp);
                              /* Unlink from the per seg/amp list ... */
 1332                         pheadp->p_lnext = plinkp->p_lnext;
 1333                         plinkp->p_lnext->p_lprev = pheadp;
                              /* ... and from the hash chain. */
 1334                         pcp->p_hprev->p_hnext = pcp->p_hnext;
 1335                         pcp->p_hnext->p_hprev = pcp->p_hprev;
 1336                         pcp->p_hprev = delcallb_list;
 1337                         delcallb_list = pcp;
                              /* The bucket became empty. */
 1338                         if (hp->p_hnext == (struct seg_pcache *)hp) {
 1339                                 seg_premove_abuck(hp, 0);
 1340                         }
 1341                         mutex_exit(&hp->p_hmutex);
 1342                 }
 1343                 mutex_exit(pmtx);
 1344         }
              /* Run the callbacks with no pcache locks held and free the entries. */
 1345         while (delcallb_list != NULL) {
 1346                 pcp = delcallb_list;
 1347                 delcallb_list = pcp->p_hprev;
 1348                 ASSERT(!pcp->p_active);
 1349                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
 1350                     pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
 1351                 npages += btop(pcp->p_len);
 1352                 kmem_cache_free(seg_pkmcache, pcp);
 1353         }
              /* Give back the pages; only non wired ones count against the window. */
 1354         mutex_enter(&seg_pmem_mtx);
 1355         ASSERT(seg_plocked >= npages);
 1356         seg_plocked -= npages;
 1357         if (!IS_PFLAGS_WIRED(flags)) {
 1358                 ASSERT(seg_plocked_window >= npages);
 1359                 seg_plocked_window -= npages;
 1360         }
 1361         mutex_exit(&seg_pmem_mtx);
 1362 }
1363
 1364 static void seg_pinit_mem_config(void); /* forward declaration */
1365
1366 /*
1367  * setup the pagelock cache
1368  */
1369 static void
1370 seg_pinit(void)
1371 {
1372         struct seg_phash *hp;
1373         ulong_t i;
1374         pgcnt_t physmegs;
1375
1376         seg_plocked = 0;
1377         seg_plocked_window = 0;
1378
1379         if (segpcache_enabled == 0) {
1380                 seg_phashsize_win = 0;
1381                 seg_phashsize_wired = 0;
1382                 seg_pdisabled = 1;
1383                 return;
1384         }
1385
1386         seg_pdisabled = 0;
1387         seg_pkmcache = kmem_cache_create("seg_pcache",
1388             sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1389         if (segpcache_pcp_maxage_ticks <= 0) {
1390                 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1391         }
1392         seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1393         seg_pathr_empty_ahb = 0;
1394         seg_pathr_full_ahb = 0;
1395         seg_pshrink_shift = segpcache_shrink_shift;
1396         seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1397
1398         mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1399         mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1400         mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1401         cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1402
1403         physmegs = physmem >> (20 - PAGESHIFT);
1404
1405         /*
1406          * If segpcache_hashsize_win was not set in /etc/system or it has
1407          * absurd value set it to a default.
1408          */
1409         if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1410                 /*
1411                  * Create one bucket per 32K (or at least per 8 pages) of
1412                  * available memory.
1413                  */
1414                 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1415                 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1416         }
1417         if (!ISP2(segpcache_hashsize_win)) {
1418                 ulong_t rndfac = ~(1UL <<
1419                     (highbit(segpcache_hashsize_win) - 1));
1420                 rndfac &= segpcache_hashsize_win;
1421                 segpcache_hashsize_win += rndfac;
1422                 segpcache_hashsize_win = 1 <<
1423                     (highbit(segpcache_hashsize_win) - 1);
1424         }
1425         seg_phashsize_win = segpcache_hashsize_win;
1426         seg_phashtab_win = kmem_zalloc(
1427             seg_phashsize_win * sizeof (struct seg_phash),
1428             KM_SLEEP);
1429         for (i = 0; i < seg_phashsize_win; i++) {
1430                 hp = &seg_phashtab_win[i];
1431                 hp->p_hnext = (struct seg_pcache *)hp;
1432                 hp->p_hprev = (struct seg_pcache *)hp;
1433                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1434         }
1435
1436         seg_pahcur = 0;
1437         seg_pathr_on = 0;
1438         seg_pahhead[0].p_lnext = &seg_pahhead[0];
1439         seg_pahhead[0].p_lprev = &seg_pahhead[0];
1440         seg_pahhead[1].p_lnext = &seg_pahhead[1];
1441         seg_pahhead[1].p_lprev = &seg_pahhead[1];
1442
1443         /*
1444          * If segpcache_hashsize_wired was not set in /etc/system or it has
1445          * absurd value set it to a default.
1446          */
1447         if (segpcache_hashsize_wired == 0 ||
1448             segpcache_hashsize_wired > physmem / 4) {
1449                 /*
1450                  * Choose segpcache_hashsize_wired based on physmem.
1451                  * Create a bucket per 128K bytes upto 256K buckets.
1452                  */
1453                 if (physmegs < 20 * 1024) {
1454                         segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1455                 } else {
1456                         segpcache_hashsize_wired = 256 * 1024;
1457                 }
1458         }
1459         if (!ISP2(segpcache_hashsize_wired)) {
1460                 segpcache_hashsize_wired = 1 <<
1461                     highbit(segpcache_hashsize_wired);
1462         }
1463         seg_phashsize_wired = segpcache_hashsize_wired;
1464         seg_phashtab_wired = kmem_zalloc(
1465             seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1466         for (i = 0; i < seg_phashsize_wired; i++) {
1467                 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1468                 hp->p_hnext = (struct seg_pcache *)hp;
1469                 hp->p_hprev = (struct seg_pcache *)hp;
1470                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1471         }
1472
1473         if (segpcache_maxwindow == 0) {
1474                 if (physmegs < 64) {
1475                         /* 3% of memory */
1476                         segpcache_maxwindow = availrmem >> 5;
1477                 } else if (physmegs < 512) {
1478                         /* 12% of memory */
1479                         segpcache_maxwindow = availrmem >> 3;
1480                 } else if (physmegs < 1024) {
1481                         /* 25% of memory */
1482                         segpcache_maxwindow = availrmem >> 2;
1483                 } else if (physmegs < 2048) {
1484                         /* 50% of memory */
1485                         segpcache_maxwindow = availrmem >> 1;
1486                 } else {
1487                         /* no limit */
1488                         segpcache_maxwindow = (pgcnt_t)-1;
1489                 }
1490         }
1491         seg_pmaxwindow = segpcache_maxwindow;
1492         seg_pinit_mem_config();
1493 }
1494
1495 /*
1496  * called by pageout if memory is low
1497  */
1498 void
1499 seg_preap(void)
1500 {
1501         /*
1502          * if the cache is off or empty, return
1503          */
1504         if (seg_plocked_window == 0) {
1505                 return;
1506         }
1507         ASSERT(seg_phashsize_win != 0);
1508
1509         /*
1510          * If somebody is already purging pcache
1511          * just return.
1512          */
1513         if (seg_pdisabled) {
1514                 return;
1515         }
1516
1517         cv_signal(&seg_pasync_cv);
1518 }
1519
1520 /*
1521  * run as a backgroud thread and reclaim pagelock
1522  * pages which have not been used recently
1523  */
1524 void
1525 seg_pasync_thread(void)
1526 {
1527         callb_cpr_t cpr_info;
1528
1529         if (seg_phashsize_win == 0) {
1530                 thread_exit();
1531                 /*NOTREACHED*/
1532         }
1533
1534         seg_pasync_thr = curthread;
1535
1536         CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1537             callb_generic_cpr, "seg_pasync");
1538
1539         if (segpcache_reap_ticks <= 0) {
1540                 segpcache_reap_ticks = segpcache_reap_sec * hz;
1541         }
1542
1543         mutex_enter(&seg_pasync_mtx);
1544         for (;;) {
1545                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1546                 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1547                     segpcache_reap_ticks, TR_CLOCK_TICK);
1548                 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1549                 if (seg_pdisabled == 0) {
1550                         seg_ppurge_async(0);
1551                 }
1552         }
1553 }
1554
1555 static struct kmem_cache *seg_cache;
1556
1557 /*
1558  * Initialize segment management data structures.
1559  */
1560 void
1561 seg_init(void)
1562 {
1563         kstat_t *ksp;
1564
1565         seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1566             0, NULL, NULL, NULL, NULL, NULL, 0);
1567
1568         ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1569             segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1570         if (ksp) {
1571                 ksp->ks_data = (void *)segadvstat_ptr;
1572                 kstat_install(ksp);
1573         }
1574
1575         seg_pinit();
1576 }
1577
1578 /*
1579  * Allocate a segment to cover [base, base+size]
1580  * and attach it to the specified address space.
1581  */
1582 struct seg *
1583 seg_alloc(struct as *as, caddr_t base, size_t size)
1584 {
1585         struct seg *new;
1586         caddr_t segbase;
1587         size_t segsize;
1588
1589         segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1590         segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1591             (uintptr_t)segbase;
1592
1593         if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1594                 return ((struct seg *)NULL);    /* bad virtual addr range */
1595
1596         if (as != &kas &&
1597             valid_usr_range(segbase, segsize, 0, as,
1598             as->a_userlimit) != RANGE_OKAY)
1599                 return ((struct seg *)NULL);    /* bad virtual addr range */
1600
1601         new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1602         new->s_ops = NULL;
1603         new->s_data = NULL;
1604         new->s_szc = 0;
1605         new->s_flags = 0;
1606         mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1607         new->s_phead.p_lnext = &new->s_phead;
1608         new->s_phead.p_lprev = &new->s_phead;
1609         if (seg_attach(as, segbase, segsize, new) < 0) {
1610                 kmem_cache_free(seg_cache, new);
1611                 return ((struct seg *)NULL);
1612         }
1613         /* caller must fill in ops, data */
1614         return (new);
1615 }
1616
1617 /*
1618  * Attach a segment to the address space.  Used by seg_alloc()
1619  * and for kernel startup to attach to static segments.
1620  */
1621 int
1622 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1623 {
1624         seg->s_as = as;
1625         seg->s_base = base;
1626         seg->s_size = size;
1627
1628         /*
1629          * as_addseg() will add the segment at the appropraite point
1630          * in the list. It will return -1 if there is overlap with
1631          * an already existing segment.
1632          */
1633         return (as_addseg(as, seg));
1634 }
1635
1636 /*
1637  * Unmap a segment and free it from its associated address space.
1638  * This should be called by anybody who's finished with a whole segment's
1639  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
1640  * responsibility of the segment driver to unlink the the segment
1641  * from the address space, and to free public and private data structures
1642  * associated with the segment.  (This is typically done by a call to
1643  * seg_free()).
1644  */
1645 void
1646 seg_unmap(struct seg *seg)
1647 {
1648 #ifdef DEBUG
1649         int ret;
1650 #endif /* DEBUG */
1651
1652         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1653
1654         /* Shouldn't have called seg_unmap if mapping isn't yet established */
1655         ASSERT(seg->s_data != NULL);
1656
1657         /* Unmap the whole mapping */
1658 #ifdef DEBUG
1659         ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1660         ASSERT(ret == 0);
1661 #else
1662         SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1663 #endif /* DEBUG */
1664 }
1665
1666 /*
1667  * Free the segment from its associated as. This should only be called
1668  * if a mapping to the segment has not yet been established (e.g., if
1669  * an error occurs in the middle of doing an as_map when the segment
1670  * has already been partially set up) or if it has already been deleted
1671  * (e.g., from a segment driver unmap routine if the unmap applies to the
1672  * entire segment). If the mapping is currently set up then seg_unmap() should
1673  * be called instead.
1674  */
1675 void
1676 seg_free(struct seg *seg)
1677 {
1678         register struct as *as = seg->s_as;
1679         struct seg *tseg = as_removeseg(as, seg);
1680
1681         ASSERT(tseg == seg);
1682
1683         /*
1684          * If the segment private data field is NULL,
1685          * then segment driver is not attached yet.
1686          */
1687         if (seg->s_data != NULL)
1688                 SEGOP_FREE(seg);
1689
1690         mutex_destroy(&seg->s_pmtx);
1691         ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1692         ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1693         kmem_cache_free(seg_cache, seg);
1694 }
1695
1696 /*ARGSUSED*/
1697 static void
1698 seg_p_mem_config_post_add(
1699         void *arg,
1700         pgcnt_t delta_pages)
1701 {
1702         /* Nothing to do. */
1703 }
1704
1705 void
1706 seg_p_enable(void)
1707 {
1708         mutex_enter(&seg_pcache_mtx);
1709         ASSERT(seg_pdisabled != 0);
1710         seg_pdisabled--;
1711         mutex_exit(&seg_pcache_mtx);
1712 }
1713
1714 /*
1715  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1716  * cache.
1717  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1718  * SEGP_FAIL if the cache could not be emptied.
1719  */
1720 int
1721 seg_p_disable(void)
1722 {
1723         pgcnt_t old_plocked;
1724         int stall_count = 0;
1725
1726         mutex_enter(&seg_pcache_mtx);
1727         seg_pdisabled++;
1728         ASSERT(seg_pdisabled != 0);
1729         mutex_exit(&seg_pcache_mtx);
1730
1731         /*
1732          * Attempt to empty the cache. Terminate if seg_plocked does not
1733          * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1734          */
1735         while (seg_plocked != 0) {
1736                 ASSERT(seg_phashsize_win != 0);
1737                 old_plocked = seg_plocked;
1738                 seg_ppurge_async(1);
1739                 if (seg_plocked == old_plocked) {
1740                         if (stall_count++ > SEGP_STALL_THRESHOLD) {
1741                                 return (SEGP_FAIL);
1742                         }
1743                 } else
1744                         stall_count = 0;
1745                 if (seg_plocked != 0)
1746                         delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1747         }
1748         return (SEGP_SUCCESS);
1749 }
1750
1751 /*
1752  * Attempt to purge seg_pcache.  May need to return before this has
1753  * completed to allow other pre_del callbacks to unlock pages. This is
1754  * ok because:
1755  *      1) The seg_pdisabled flag has been set so at least we won't
1756  *      cache anymore locks and the locks we couldn't purge
1757  *      will not be held if they do get released by a subsequent
1758  *      pre-delete callback.
1759  *
1760  *      2) The rest of the memory delete thread processing does not
1761  *      depend on the changes made in this pre-delete callback. No
1762  *      panics will result, the worst that will happen is that the
1763  *      DR code will timeout and cancel the delete.
1764  */
1765 /*ARGSUSED*/
1766 static int
1767 seg_p_mem_config_pre_del(
1768         void *arg,
1769         pgcnt_t delta_pages)
1770 {
1771         if (seg_phashsize_win == 0) {
1772                 return (0);
1773         }
1774         if (seg_p_disable() != SEGP_SUCCESS)
1775                 cmn_err(CE_NOTE,
1776                     "!Pre-delete couldn't purge"" pagelock cache - continuing");
1777         return (0);
1778 }
1779
1780 /*ARGSUSED*/
1781 static void
1782 seg_p_mem_config_post_del(
1783         void *arg,
1784         pgcnt_t delta_pages,
1785         int cancelled)
1786 {
1787         if (seg_phashsize_win == 0) {
1788                 return;
1789         }
1790         seg_p_enable();
1791 }
1792
1793 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1794         KPHYSM_SETUP_VECTOR_VERSION,
1795         seg_p_mem_config_post_add,
1796         seg_p_mem_config_pre_del,
1797         seg_p_mem_config_post_del,
1798 };
1799
1800 static void
1801 seg_pinit_mem_config(void)
1802 {
1803         int ret;
1804
1805         ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1806         /*
1807          * Want to catch this in the debug kernel. At run time, if the
1808          * callbacks don't get run all will be OK as the disable just makes
1809          * it more likely that the pages can be collected.
1810          */
1811         ASSERT(ret == 0);
1812 }
1813
1814 /*
1815  * Verify that segment is not a shared anonymous segment which reserves
1816  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1817  * from one zone to another if any segments are shared.  This is because the
1818  * last process to exit will credit the swap reservation.  This could lead
1819  * to the swap being reserved by one zone, and credited to another.
1820  */
1821 boolean_t
1822 seg_can_change_zones(struct seg *seg)
1823 {
1824         struct segvn_data *svd;
1825
1826         if (seg->s_ops == &segspt_shmops)
1827                 return (B_FALSE);
1828
1829         if (seg->s_ops == &segvn_ops) {
1830                 svd = (struct segvn_data *)seg->s_data;
1831                 if (svd->type == MAP_SHARED &&
1832                     svd->amp != NULL &&
1833                     svd->amp->swresv > 0)
1834                 return (B_FALSE);
1835         }
1836         return (B_TRUE);
1837 }
1838
1839 /*
1840  * Return swap reserved by a segment backing a private mapping.
1841  */
1842 size_t
1843 seg_swresv(struct seg *seg)
1844 {
1845         struct segvn_data *svd;
1846         size_t swap = 0;
1847
1848         if (seg->s_ops == &segvn_ops) {
1849                 svd = (struct segvn_data *)seg->s_data;
1850                 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1851                         swap = svd->swresv;
1852         }
1853         return (swap);
1854 }
1855
1856 /*
1857  * General not supported function for SEGOP_INHERIT
1858  */
1859 /* ARGSUSED */
1860 int
1861 seg_inherit_notsup(struct seg *seg, caddr_t addr, size_t len, uint_t op)
1862 {
1863         return (ENOTSUP);
1864 }