kernel/vm/vm_seg.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
  25  * Copyright (c) 2018, Joyent, Inc.
  26  */
  27
  28 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  29 /*        All Rights Reserved   */
  30
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40
  41 /*
  42  * VM - segment management.
  43  */
  44
  45 #include <sys/types.h>
  46 #include <sys/inttypes.h>
  47 #include <sys/t_lock.h>
  48 #include <sys/param.h>
  49 #include <sys/systm.h>
  50 #include <sys/kmem.h>
  51 #include <sys/sysmacros.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/tuneable.h>
  54 #include <sys/debug.h>
  55 #include <sys/fs/swapnode.h>
  56 #include <sys/cmn_err.h>
  57 #include <sys/callb.h>
  58 #include <sys/mem_config.h>
  59 #include <sys/mman.h>
  60
  61 #include <vm/hat.h>
  62 #include <vm/as.h>
  63 #include <vm/seg.h>
  64 #include <vm/seg_kmem.h>
  65 #include <vm/seg_spt.h>
  66 #include <vm/seg_vn.h>
  67 #include <vm/anon.h>
  68
   69 /*
   70  * kstats for segment advise
       *
       * segadvstat holds two named ulong counters, "MADV_FREE_hit" and
       * "MADV_FREE_miss".  segadvstat_ptr exposes the same storage as an array
       * of kstat_named_t and segadvstat_ndata is the number of entries in that
       * array (structure size divided by the size of one kstat_named_t).
   71  */
   72 segadvstat_t segadvstat = {
   73         { "MADV_FREE_hit",      KSTAT_DATA_ULONG },
   74         { "MADV_FREE_miss",     KSTAT_DATA_ULONG },
   75 };
   76
   77 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
   78 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
  79
   80 /*
   81  * entry in the segment page cache
       *
       * The hash linkage (p_hnext/p_hprev) must stay first: the bucket heads
       * (struct seg_phash and struct seg_phash_wired) start with the same two
       * members, and the code in this file casts a bucket head to
       * struct seg_pcache * to use it as the sentinel of the circular hash
       * chain.  seg_plookup_checkdup() also reuses p_hprev to chain entries it
       * has unlinked.
   82  */
   83 struct seg_pcache {
   84         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
   85         struct seg_pcache       *p_hprev;
   86         pcache_link_t           p_plink;        /* per segment/amp list */
   87         void                    *p_htag0;       /* segment/amp pointer */
   88         caddr_t                 p_addr;         /* base address/anon_idx */
   89         size_t                  p_len;          /* total bytes */
   90         size_t                  p_wlen;         /* writable bytes at p_addr */
   91         struct page             **p_pp;         /* pp shadow list */
   92         seg_preclaim_cbfunc_t   p_callback;     /* reclaim callback function */
   93         clock_t                 p_lbolt;        /* lbolt from last use */
   94         struct seg_phash        *p_hashp;       /* our pcache hash bucket */
   95         uint_t                  p_active;       /* active count */
   96         uchar_t                 p_write;        /* true if S_WRITE */
   97         uchar_t                 p_ref;          /* reference byte */
   98         ushort_t                p_flags;        /* bit flags */
   99 };
  100
      /*
       * Hash bucket head used by the table for non wired entries
       * (seg_phashtab_win).  p_hnext/p_hprev anchor the circular chain of
       * seg_pcache entries; an empty bucket points back at itself.  The two
       * p_halink[] elements are the bucket's linkage on each of the two active
       * bucket lists; a NULL p_lnext/p_lprev pair means "not on that list".
       */
  101 struct seg_phash {
  102         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
  103         struct seg_pcache       *p_hprev;
  104         kmutex_t                p_hmutex;       /* protects hash bucket */
  105         pcache_link_t           p_halink[2];    /* active bucket linkages */
  106 };
  107
      /*
       * Hash bucket head used by the table for wired entries
       * (seg_phashtab_wired).  It has the same leading members as
       * struct seg_phash but no active list linkage; P_HASHBP() casts a pointer
       * to one of these to struct seg_phash *, so the common members must keep
       * the same order in both structures.
       */
  108 struct seg_phash_wired {
  109         struct seg_pcache       *p_hnext;       /* list for hashed blocks */
  110         struct seg_pcache       *p_hprev;
  111         kmutex_t                p_hmutex;       /* protects hash bucket */
  112 };
 113
  114 /*
  115  * Default upper bound, in bytes (1 GB), on how much can be purged from
  116  * pcache at a time.  Initial value of segpcache_maxapurge_bytes.
  117  */
  118 #define P_MAX_APURGE_BYTES      (1024 * 1024 * 1024)
  119
  120 /*
  121  * log2(fraction of pcache to reclaim at a time).
       * Initial value of segpcache_shrink_shift.
  122  */
  123 #define P_SHRINK_SHFT           (5)
 124
  125 /*
  126  * The following variables can be tuned via /etc/system.
       *
       * NOTE(review): several of these default to 0; presumably the real
       * values are chosen by initialization code that is not part of this
       * section -- confirm there before relying on a 0 meaning "disabled".
       * Note also that segpcache_maxapurge_bytes is declared pgcnt_t although,
       * per its name and default, it holds a byte count.
  127  */
  128
  129 int     segpcache_enabled = 1;          /* if 1, shadow lists are cached */
  130 pgcnt_t segpcache_maxwindow = 0;        /* max # of pages that can be cached */
  131 ulong_t segpcache_hashsize_win = 0;     /* # of non wired buckets */
  132 ulong_t segpcache_hashsize_wired = 0;   /* # of wired buckets */
  133 int     segpcache_reap_sec = 1;         /* reap check rate in secs */
  134 clock_t segpcache_reap_ticks = 0;       /* reap interval in ticks */
  135 int     segpcache_pcp_maxage_sec = 1;   /* pcp max age in secs */
  136 clock_t segpcache_pcp_maxage_ticks = 0; /* pcp max age in ticks */
  137 int     segpcache_shrink_shift = P_SHRINK_SHFT; /* log2 reap fraction */
  138 pgcnt_t segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES; /* max purge bytes */
  139
  140 static kmutex_t seg_pcache_mtx; /* protects seg_pdisabled counter */
  141 static kmutex_t seg_pasync_mtx; /* protects async thread scheduling */
  142 static kcondvar_t seg_pasync_cv; /* presumably waited on under seg_pasync_mtx -- confirm at use sites */
 143
  144 #pragma align 64(pctrl1)
  145 #pragma align 64(pctrl2)
  146 #pragma align 64(pctrl3)
  147
  148 /*
  149  * Keep frequently used variables together in one cache line.
       *
       * The pragmas above request 64 byte alignment for pctrl1, pctrl2 and
       * pctrl3.  pctrl1 collects the cache configuration: the disable counter,
       * the window limit and the two hash tables with their sizes, plus the
       * kmem cache used for seg_pcache structures.  The members are normally
       * reached through the seg_p* shorthand macros defined below.
       * NOTE(review): the _LP64 pad member looks like it rounds the structure
       * up to 64 bytes -- confirm against the actual type sizes.
  150  */
  151 static struct p_ctrl1 {
  152         uint_t p_disabled;              /* if not 0, caching temporarily off */
  153         pgcnt_t p_maxwin;               /* max # of pages that can be cached */
  154         size_t p_hashwin_sz;            /* # of non wired buckets */
  155         struct seg_phash *p_htabwin;    /* hash table for non wired entries */
  156         size_t p_hashwired_sz;          /* # of wired buckets */
  157         struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
  158         kmem_cache_t *p_kmcache;        /* kmem cache for seg_pcache structs */
  159 #ifdef _LP64
  160         ulong_t pad[1];
  161 #endif /* _LP64 */
  162 } pctrl1;
  163
      /*
       * Accounting and active bucket list state.  p_mem_mtx (seg_pmem_mtx)
       * is the lock taken by seg_padd_abuck()/seg_premove_abuck() around
       * updates of the active bucket lists anchored at p_ahhead[].  p_ahcur is
       * the index (0 or 1) of the list currently used for inserts and deletes.
       */
  164 static struct p_ctrl2 {
  165         kmutex_t p_mem_mtx;     /* protects window counter and p_halinks */
  166         pgcnt_t  p_locked_win;  /* # pages from window */
  167         pgcnt_t  p_locked;      /* # of pages cached by pagelock */
  168         uchar_t  p_ahcur;       /* current active links for insert/delete */
  169         uchar_t  p_athr_on;     /* async reclaim thread is running. */
  170         pcache_link_t p_ahhead[2]; /* active buckets linkages */
  171 } pctrl2;
  172
      /*
       * Reclaim parameters and asynchronous thread walk statistics.
       * NOTE(review): the values are presumably derived from the segpcache_*
       * tunables by initialization code outside this section -- confirm.
       */
  173 static struct p_ctrl3 {
  174         clock_t p_pcp_maxage;           /* max pcp age in ticks */
  175         ulong_t p_athr_empty_ahb;       /* athread walk stats */
  176         ulong_t p_athr_full_ahb;        /* athread walk stats */
  177         pgcnt_t p_maxapurge_npages;     /* max pages to purge at a time */
  178         int     p_shrink_shft;          /* reap shift factor */
  179 #ifdef _LP64
  180         ulong_t pad[3];
  181 #endif /* _LP64 */
  182 } pctrl3;
  183
      /*
       * Shorthand names for the members of pctrl1, pctrl2 and pctrl3.  The
       * rest of this file uses these names rather than the structure members.
       */
  184 #define seg_pdisabled                   pctrl1.p_disabled
  185 #define seg_pmaxwindow                  pctrl1.p_maxwin
  186 #define seg_phashsize_win               pctrl1.p_hashwin_sz
  187 #define seg_phashtab_win                pctrl1.p_htabwin
  188 #define seg_phashsize_wired             pctrl1.p_hashwired_sz
  189 #define seg_phashtab_wired              pctrl1.p_htabwired
  190 #define seg_pkmcache                    pctrl1.p_kmcache
  191 #define seg_pmem_mtx                    pctrl2.p_mem_mtx
  192 #define seg_plocked_window              pctrl2.p_locked_win
  193 #define seg_plocked                     pctrl2.p_locked
  194 #define seg_pahcur                      pctrl2.p_ahcur
  195 #define seg_pathr_on                    pctrl2.p_athr_on
  196 #define seg_pahhead                     pctrl2.p_ahhead
  197 #define seg_pmax_pcpage                 pctrl3.p_pcp_maxage
  198 #define seg_pathr_empty_ahb             pctrl3.p_athr_empty_ahb
  199 #define seg_pathr_full_ahb              pctrl3.p_athr_full_ahb
  200 #define seg_pshrink_shift               pctrl3.p_shrink_shft
  201 #define seg_pmaxapurge_npages           pctrl3.p_maxapurge_npages
  202
      /*
       * Index masks for the two hash tables (table size minus one) and the
       * right shift that P_HASHBP() applies to htag0 when hashing wired
       * entries.
       * NOTE(review): masking with (size - 1) only selects every bucket when
       * the table sizes are powers of two -- presumably guaranteed where the
       * sizes are set; confirm.
       */
  203 #define P_HASHWIN_MASK                  (seg_phashsize_win - 1)
  204 #define P_HASHWIRED_MASK                (seg_phashsize_wired - 1)
  205 #define P_BASESHIFT                     (6)
  206
      /*
       * NOTE(review): presumably the asynchronous purge thread; it is not
       * assigned anywhere in this section -- confirm where it is created.
       */
  207 kthread_t *seg_pasync_thr;
  208
  209 extern const struct seg_ops segvn_ops;
  210 extern const struct seg_ops segspt_shmops;
  211
      /*
       * Non-zero when SEGP_FORCE_WIRED is set in a flags word, respectively in
       * the p_flags of a cached entry.
       */
  212 #define IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
  213 #define IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
  214
      /*
       * Difference between the current ddi_get_lbolt() value and the earlier
       * reading t, converted to ulong_t.
       */
  215 #define LBOLT_DELTA(t)  ((ulong_t)(ddi_get_lbolt() - (t)))
  216
      /* LBOLT_DELTA() applied to the entry's p_lbolt (lbolt from last use). */
  217 #define PCP_AGE(pcp)    LBOLT_DELTA((pcp)->p_lbolt)
 218
 219 /*
 220  * htag0 argument can be a seg or amp pointer.
 221  */
 222 #define P_HASHBP(seg, htag0, addr, flags)                               \
 223         (IS_PFLAGS_WIRED((flags)) ?                                     \
 224             ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK & \
 225             ((uintptr_t)(htag0) >> P_BASESHIFT)]) :                     \
 226             (&seg_phashtab_win[P_HASHWIN_MASK &                         \
 227             (((uintptr_t)(htag0) >> 3) ^                                \
 228             ((uintptr_t)(addr) >> ((flags & SEGP_PSHIFT) ?              \
 229             (flags >> 16) : page_get_shift((seg)->s_szc))))]))
 230
  231 /*
  232  * htag0 argument can be a seg or amp pointer.
       *
       * P_MATCH() is true when the cached entry pcp has the same tag and start
       * address as the request and its length p_len is at least len, i.e. an
       * entry covering a longer range from the same address also matches.
       * P_MATCH_PP() additionally requires that the entry's shadow list
       * pointer p_pp is exactly pp.
  233  */
  234 #define P_MATCH(pcp, htag0, addr, len)                                  \
  235         ((pcp)->p_htag0 == (htag0) &&                                   \
  236         (pcp)->p_addr == (addr) &&                                      \
  237         (pcp)->p_len >= (len))
  238
  239 #define P_MATCH_PP(pcp, htag0, addr, len, pp)                           \
  240         ((pcp)->p_pp == (pp) &&                                         \
  241         (pcp)->p_htag0 == (htag0) &&                                    \
  242         (pcp)->p_addr == (addr) &&                                      \
  243         (pcp)->p_len >= (len))
  244
      /*
       * Convert a pointer to an embedded list link back to the structure that
       * contains it by subtracting the member offset: plink2pcache() maps a
       * p_plink pointer to its struct seg_pcache, hlink2phash() maps a pointer
       * to p_halink[l] to its struct seg_phash (l selects which of the two
       * active list linkages hl points at).
       */
  245 #define plink2pcache(pl)        ((struct seg_pcache *)((uintptr_t)(pl) - \
  246     offsetof(struct seg_pcache, p_plink)))
  247
  248 #define hlink2phash(hl, l)      ((struct seg_phash *)((uintptr_t)(hl) - \
  249     offsetof(struct seg_phash, p_halink[l])))
 250
  251 /*
  252  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
  253  * active hash bucket lists. We maintain active bucket lists to reduce the
  254  * overhead of finding active buckets during asynchronous purging since there
  255  * can be 10s of millions of buckets on a large system but only a small subset
  256  * of them in actual use.
  257  *
  258  * There are 2 active bucket lists. Current active list (as per seg_pahcur) is
  259  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
  260  * buckets. The other list is used by asynchronous purge thread. This allows
  261  * the purge thread to walk its active list without holding seg_pmem_mtx for a
  262  * long time. When asynchronous thread is done with its list it switches to
  263  * current active list and makes the list it just finished processing the
  264  * current active list.
  265  *
  266  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
  267  * yet on any list.  seg_premove_abuck() may remove the bucket from either
  268  * list. If the bucket is on current list it will be always removed. Otherwise
  269  * the bucket is only removed if asynchronous purge thread is not currently
  270  * running or seg_premove_abuck() is called by asynchronous purge thread
  271  * itself. A given bucket can only be on one of active lists at a time. These
  272  * routines should be called with per bucket lock held.  The routines use
  273  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
  274  * the first entry is added to the bucket chain and seg_premove_abuck() must
  275  * be called after the last pcp entry is deleted from its chain. Per bucket
  276  * lock should be held by the callers.  This avoids a potential race condition
  277  * when seg_premove_abuck() removes a bucket after pcp entries are added to
  278  * its list after the caller checked that the bucket has no entries. (this
  279  * race would cause a loss of an active bucket from the active lists).
  280  *
  281  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
  282  * New entries are added to the end of the list since LRU is used as the
  283  * purging policy.
       *
       * seg_padd_abuck(hp): hp must be a bucket of seg_phashtab_win whose chain
       * holds exactly one, non wired, entry and whose p_hmutex is held by the
       * caller (all asserted below).  The function returns nothing; on return
       * the bucket is on one of the two active lists.
  284  */
  285 static void
  286 seg_padd_abuck(struct seg_phash *hp)
  287 {
  288         int lix;
  289
  290         ASSERT(MUTEX_HELD(&hp->p_hmutex));
              /* The chain is not empty ... */
  291         ASSERT((struct seg_phash *)hp->p_hnext != hp);
  292         ASSERT((struct seg_phash *)hp->p_hprev != hp);
              /* ... and holds exactly one entry, which links back to the head. */
  293         ASSERT(hp->p_hnext == hp->p_hprev);
  294         ASSERT(!IS_PCP_WIRED(hp->p_hnext));
  295         ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
  296         ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
              /* Only buckets of the non wired table carry active list links. */
  297         ASSERT(hp >= seg_phashtab_win &&
  298             hp < &seg_phashtab_win[seg_phashsize_win]);
  299
  300         /*
  301          * This bucket can already be on one of active lists
  302          * since seg_premove_abuck() may have failed to remove it
  303          * before.
  304          */
  305         mutex_enter(&seg_pmem_mtx);
  306         lix = seg_pahcur;
  307         ASSERT(lix >= 0 && lix <= 1);
              /* Already on the current list: nothing to do. */
  308         if (hp->p_halink[lix].p_lnext != NULL) {
  309                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
  310                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
  311                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
  312                 mutex_exit(&seg_pmem_mtx);
  313                 return;
  314         }
  315         ASSERT(hp->p_halink[lix].p_lprev == NULL);
  316
  317         /*
  318          * If this bucket is still on list !lix async thread can't yet remove
  319          * it since we hold here per bucket lock. In this case just return
  320          * since async thread will eventually find and process this bucket.
  321          */
  322         if (hp->p_halink[!lix].p_lnext != NULL) {
  323                 ASSERT(hp->p_halink[!lix].p_lprev != NULL);
  324                 mutex_exit(&seg_pmem_mtx);
  325                 return;
  326         }
  327         ASSERT(hp->p_halink[!lix].p_lprev == NULL);
  328         /*
  329          * This bucket is not on any active bucket list yet.
  330          * Add the bucket to the tail of current active list.
               * The new link goes between the old tail (seg_pahhead[lix].p_lprev)
               * and the list head; the head's p_lprev is updated last because
               * the two preceding statements still read the old tail from it.
  331          */
  332         hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
  333         hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
  334         seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
  335         seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
  336         mutex_exit(&seg_pmem_mtx);
  337 }
  338
      /*
       * Unlink the bucket hp from the active bucket list it is on.
       *
       * hp must be a bucket of seg_phashtab_win whose hash chain is empty and
       * whose p_hmutex is held by the caller (asserted below).
       *
       * athr is non-zero when the caller is the asynchronous purge thread.  In
       * that case the bucket is expected to be on the list that is not current
       * (!seg_pahcur) and is unlinked from it without taking seg_pmem_mtx.
       *
       * Otherwise seg_pmem_mtx is taken.  A bucket found on the current list is
       * unlinked.  A bucket on the other list is unlinked only if seg_pathr_on
       * is clear; if the asynchronous thread is running the bucket is left on
       * that list.  There is no return value, so the caller cannot tell which
       * of these happened.
       */
  339 static void
  340 seg_premove_abuck(struct seg_phash *hp, int athr)
  341 {
  342         int lix;
  343
  344         ASSERT(MUTEX_HELD(&hp->p_hmutex));
              /* The hash chain is empty: the head points at itself. */
  345         ASSERT((struct seg_phash *)hp->p_hnext == hp);
  346         ASSERT((struct seg_phash *)hp->p_hprev == hp);
  347         ASSERT(hp >= seg_phashtab_win &&
  348             hp < &seg_phashtab_win[seg_phashsize_win]);
  349
  350         if (athr) {
  351                 ASSERT(seg_pathr_on);
  352                 ASSERT(seg_pahcur <= 1);
  353                 /*
  354                  * We are called by asynchronous thread that found this bucket
  355                  * on not currently active (i.e. !seg_pahcur) list. Remove it
  356                  * from there.  Per bucket lock we are holding makes sure
  357                  * seg_pinsert() can't sneak in and add pcp entries to this
  358                  * bucket right before we remove the bucket from its list.
                       *
                       * NOTE(review): seg_pmem_mtx is not taken on this path;
                       * presumably safe because only the asynchronous thread
                       * touches the non current list while it runs -- confirm
                       * against the asynchronous thread code.
  359                  */
  360                 lix = !seg_pahcur;
  361                 ASSERT(hp->p_halink[lix].p_lnext != NULL);
  362                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
  363                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
  364                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
                      /* Unlink, then NULL both pointers to mark "on no list". */
  365                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
  366                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
  367                 hp->p_halink[lix].p_lnext = NULL;
  368                 hp->p_halink[lix].p_lprev = NULL;
  369                 return;
  370         }
  371
  372         mutex_enter(&seg_pmem_mtx);
  373         lix = seg_pahcur;
  374         ASSERT(lix >= 0 && lix <= 1);
  375
  376         /*
  377          * If the bucket is on currently active list just remove it from
  378          * there.
  379          */
  380         if (hp->p_halink[lix].p_lnext != NULL) {
  381                 ASSERT(hp->p_halink[lix].p_lprev != NULL);
  382                 ASSERT(hp->p_halink[!lix].p_lnext == NULL);
  383                 ASSERT(hp->p_halink[!lix].p_lprev == NULL);
  384                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
  385                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
  386                 hp->p_halink[lix].p_lnext = NULL;
  387                 hp->p_halink[lix].p_lprev = NULL;
  388                 mutex_exit(&seg_pmem_mtx);
  389                 return;
  390         }
  391         ASSERT(hp->p_halink[lix].p_lprev == NULL);
  392
  393         /*
  394          * If asynchronous thread is not running we can remove the bucket from
  395          * not currently active list. The bucket must be on this list since we
  396          * already checked that it's not on the other list and the bucket from
  397          * which we just deleted the last pcp entry must be still on one of the
  398          * active bucket lists.
  399          */
  400         lix = !lix;
  401         ASSERT(hp->p_halink[lix].p_lnext != NULL);
  402         ASSERT(hp->p_halink[lix].p_lprev != NULL);
  403
              /*
               * When the asynchronous thread is running the bucket stays on
               * this list; nothing is unlinked here in that case.
               */
  404         if (!seg_pathr_on) {
  405                 hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
  406                 hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
  407                 hp->p_halink[lix].p_lnext = NULL;
  408                 hp->p_halink[lix].p_lprev = NULL;
  409         }
  410         mutex_exit(&seg_pmem_mtx);
  411 }
 412
  413 /*
  414  * Check if the bucket pointed to by hp already has a pcp entry that matches
  415  * request htag0, addr and len. Set *found to 1 if match is found and to 0
  416  * otherwise. Also delete matching entries that cover smaller address range
  417  * but start at the same address as addr argument. Return the list of deleted
  418  * entries if any. This is an internal helper function called from
  419  * seg_pinsert() only for non wired shadow lists. The caller already holds a
  420  * per seg/amp list lock.
       *
       * Details visible in the code below:
       *  - the caller must also hold hp->p_hmutex (asserted);
       *  - an entry with the same tag and address whose p_len is at least len
       *    counts as a match and ends the search immediately;
       *  - a shorter entry is unlinked from both its per seg/amp list and the
       *    hash chain, unless its p_active count is non-zero, in which case it
       *    is left in place;
       *  - unlinked entries are returned as a singly linked list chained
       *    through p_hprev and terminated by NULL; NULL is returned when
       *    nothing was unlinked.  The entries are not freed here.
  421  */
  422 static struct seg_pcache *
  423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
  424     caddr_t addr, size_t len, int *found)
  425 {
  426         struct seg_pcache *pcp;
  427         struct seg_pcache *delcallb_list = NULL;
  428
  429         ASSERT(MUTEX_HELD(&hp->p_hmutex));
  430
  431         *found = 0;
              /* Walk the circular chain; the bucket head is the sentinel. */
  432         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
  433             pcp = pcp->p_hnext) {
  434                 ASSERT(pcp->p_hashp == hp);
  435                 if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
  436                         ASSERT(!IS_PCP_WIRED(pcp));
  437                         if (pcp->p_len < len) {
  438                                 pcache_link_t *plinkp;
                                      /* Shorter but still in use: keep it. */
  439                                 if (pcp->p_active) {
  440                                         continue;
  441                                 }
                                      /* Unlink from the per seg/amp list ... */
  442                                 plinkp = &pcp->p_plink;
  443                                 plinkp->p_lprev->p_lnext = plinkp->p_lnext;
  444                                 plinkp->p_lnext->p_lprev = plinkp->p_lprev;
                                      /* ... and from the hash chain. */
  445                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
  446                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
                                      /*
                                       * Push onto the deleted list through
                                       * p_hprev.  p_hnext is left untouched so
                                       * the loop can still advance from this
                                       * entry.
                                       */
  447                                 pcp->p_hprev = delcallb_list;
  448                                 delcallb_list = pcp;
  449                         } else {
  450                                 *found = 1;
  451                                 break;
  452                         }
  453                 }
  454         }
  455         return (delcallb_list);
  456 }
 457
  458 /*
  459  * lookup an address range in pagelock cache. Return shadow list and bump up
  460  * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
  461  * as a lookup tag.
       *
       * seg must not be NULL and rw must be S_READ or S_WRITE (asserted).
       * flags selects the hash table and hash shift (see P_HASHBP()).
       *
       * Returns the p_pp shadow list of the first entry in the bucket that has
       * the same tag and address and covers at least len bytes.  Returns NULL,
       * without changing any entry, when:
       *  - seg_pdisabled is non-zero;
       *  - no entry matches;
       *  - rw is S_WRITE and the matching entry's p_wlen is smaller than len;
       *  - the matching entry's p_active count is already UINT_MAX.
       * In the last two cases the search stops at that entry; later entries
       * in the chain are not examined.
       *
       * On success the entry's p_active count is incremented and, for S_WRITE
       * requests, p_write is set.
  462  */
  463 struct page **
  464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
  465     enum seg_rw rw, uint_t flags)
  466 {
  467         struct seg_pcache *pcp;
  468         struct seg_phash *hp;
  469         void *htag0;
  470
  471         ASSERT(seg != NULL);
  472         ASSERT(rw == S_READ || rw == S_WRITE);
  473
  474         /*
  475          * Skip pagelock cache, while DR is in progress or
  476          * seg_pcache is off.
  477          */
  478         if (seg_pdisabled) {
  479                 return (NULL);
  480         }
  481         ASSERT(seg_phashsize_win != 0);
  482
  483         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
  484         hp = P_HASHBP(seg, htag0, addr, flags);
  485         mutex_enter(&hp->p_hmutex);
              /* Walk the circular chain; the bucket head is the sentinel. */
  486         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
  487             pcp = pcp->p_hnext) {
  488                 ASSERT(pcp->p_hashp == hp);
  489                 if (P_MATCH(pcp, htag0, addr, len)) {
  490                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
  491                         /*
  492                          * If this request wants to write pages
  493                          * but write permissions starting from
  494                          * addr don't cover the entire length len
  495                          * return lookup failure back to the caller.
  496                          * It will check protections and fail this
  497                          * pagelock operation with EACCES error.
  498                          */
  499                         if (rw == S_WRITE && pcp->p_wlen < len) {
  500                                 break;
  501                         }
                              /*
                               * The active count cannot be incremented any
                               * further; report a miss instead of wrapping.
                               */
  502                         if (pcp->p_active == UINT_MAX) {
  503                                 break;
  504                         }
  505                         pcp->p_active++;
                              /* Remember that a writer has used this entry. */
  506                         if (rw == S_WRITE && !pcp->p_write) {
  507                                 pcp->p_write = 1;
  508                         }
  509                         mutex_exit(&hp->p_hmutex);
                              /*
                               * NOTE(review): p_pp is read after the bucket
                               * lock is dropped; presumably safe because the
                               * entry was just marked active -- confirm that
                               * entries with p_active != 0 are never freed.
                               */
  510                         return (pcp->p_pp);
  511                 }
  512         }
  513         mutex_exit(&hp->p_hmutex);
  514         return (NULL);
  515 }
 516
 517 /*
 518  * mark address range inactive. If the cache is off or the address range is
 519  * not in the cache or another shadow list that covers bigger range is found
 520  * we call the segment driver to reclaim the pages. Otherwise just decrement
 521  * active count and set ref bit.  If amp is not NULL use amp as a lookup tag
 522  * otherwise use seg as a lookup tag.
 523  */
 524 void
 525 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
 526     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
 527     seg_preclaim_cbfunc_t callback)
 528 {
 529         struct seg_pcache *pcp;
 530         struct seg_phash *hp;
 531         kmutex_t *pmtx = NULL;
 532         pcache_link_t *pheadp;
 533         void *htag0;
 534         pgcnt_t npages = 0;
 535         int keep = 0;
 536
 537         ASSERT(seg != NULL);
 538         ASSERT(rw == S_READ || rw == S_WRITE);
 539
 540         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
 541
 542         /*
 543          * Skip lookup if pcache is not configured.
 544          */
 545         if (seg_phashsize_win == 0) {
 546                 goto out;
 547         }
 548
 549         /*
 550          * Grab per seg/amp lock before hash lock if we are going to remove
 551          * inactive entry from pcache.
 552          */
 553         if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
 554                 if (amp == NULL) {
 555                         pheadp = &seg->s_phead;
 556                         pmtx = &seg->s_pmtx;
 557                 } else {
 558                         pheadp = &amp->a_phead;
 559                         pmtx = &amp->a_pmtx;
 560                 }
 561                 mutex_enter(pmtx);
 562         }
 563
 564         hp = P_HASHBP(seg, htag0, addr, flags);
 565         mutex_enter(&hp->p_hmutex);
 566 again:
 567         for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
 568             pcp = pcp->p_hnext) {
 569                 ASSERT(pcp->p_hashp == hp);
 570                 if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
 571                         ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
 572                         ASSERT(pcp->p_active);
 573                         if (keep) {
 574                                 /*
 575                                  * Don't remove this pcp entry
 576                                  * if we didn't find duplicate
 577                                  * shadow lists on second search.
 578                                  * Somebody removed those duplicates
 579                                  * since we dropped hash lock after first
 580                                  * search.
 581                                  */
 582                                 ASSERT(pmtx != NULL);
 583                                 ASSERT(!IS_PFLAGS_WIRED(flags));
 584                                 mutex_exit(pmtx);
 585                                 pmtx = NULL;
 586                         }
 587                         pcp->p_active--;
 588                         if (pcp->p_active == 0 && (pmtx != NULL ||
 589                             (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
 590
 591                                 /*
 592                                  * This entry is no longer active.  Remove it
 593                                  * now either because pcaching is temporarily
 594                                  * disabled or there're other pcp entries that
 595                                  * can match this pagelock request (i.e. this
 596                                  * entry is a duplicate).
 597                                  */
 598
 599                                 ASSERT(callback == pcp->p_callback);
 600                                 if (pmtx != NULL) {
 601                                         pcache_link_t *plinkp = &pcp->p_plink;
 602                                         ASSERT(!IS_PCP_WIRED(pcp));
 603                                         ASSERT(pheadp->p_lnext != pheadp);
 604                                         ASSERT(pheadp->p_lprev != pheadp);
 605                                         plinkp->p_lprev->p_lnext =
 606                                             plinkp->p_lnext;
 607                                         plinkp->p_lnext->p_lprev =
 608                                             plinkp->p_lprev;
 609                                 }
 610                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
 611                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
 612                                 if (!IS_PCP_WIRED(pcp) &&
 613                                     hp->p_hnext == (struct seg_pcache *)hp) {
 614                                         /*
 615                                          * We removed the last entry from this
 616                                          * bucket.  Now remove the bucket from
 617                                          * its active list.
 618                                          */
 619                                         seg_premove_abuck(hp, 0);
 620                                 }
 621                                 mutex_exit(&hp->p_hmutex);
 622                                 if (pmtx != NULL) {
 623                                         mutex_exit(pmtx);
 624                                 }
 625                                 len = pcp->p_len;
 626                                 npages = btop(len);
 627                                 if (rw != S_WRITE && pcp->p_write) {
 628                                         rw = S_WRITE;
 629                                 }
 630                                 kmem_cache_free(seg_pkmcache, pcp);
 631                                 goto out;
 632                         } else {
 633                                 /*
 634                                  * We found a matching pcp entry but will not
 635                                  * free it right away even if it's no longer
 636                                  * active.
 637                                  */
 638                                 if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
 639                                         /*
 640                                          * Set the reference bit and mark the
 641                                          * time of last access to this pcp
 642                                          * so that asynchronous thread doesn't
 643                                          * free it immediately since
 644                                          * it may be reactivated very soon.
 645                                          */
 646                                         pcp->p_lbolt = ddi_get_lbolt();
 647                                         pcp->p_ref = 1;
 648                                 }
 649                                 mutex_exit(&hp->p_hmutex);
 650                                 if (pmtx != NULL) {
 651                                         mutex_exit(pmtx);
 652                                 }
 653                                 return;
 654                         }
 655                 } else if (!IS_PFLAGS_WIRED(flags) &&
 656                     P_MATCH(pcp, htag0, addr, len)) {
 657                         /*
 658                          * This is a duplicate pcp entry.  This situation may
 659                          * happen if a bigger shadow list that covers our
 660                          * range was added while our entry was still active.
 661                          * Now we can free our pcp entry if it becomes
 662                          * inactive.
 663                          */
 664                         if (!pcp->p_active) {
 665                                 /*
 666                                  * Mark this entry as referenced just in case
 667                                  * we'll free our own pcp entry soon.
 668                                  */
 669                                 pcp->p_lbolt = ddi_get_lbolt();
 670                                 pcp->p_ref = 1;
 671                         }
 672                         if (pmtx != NULL) {
 673                                 /*
 674                                  * we are already holding pmtx and found a
 675                                  * duplicate.  Don't keep our own pcp entry.
 676                                  */
 677                                 keep = 0;
 678                                 continue;
 679                         }
 680                         /*
 681                          * We have to use mutex_tryenter to attempt to lock
 682                          * seg/amp list lock since we already hold hash lock
 683                          * and seg/amp list lock is above hash lock in lock
 684                          * order.  If mutex_tryenter fails drop hash lock and
 685                          * retake both locks in correct order and research
 686                          * this hash chain.
 687                          */
 688                         ASSERT(keep == 0);
 689                         if (amp == NULL) {
 690                                 pheadp = &seg->s_phead;
 691                                 pmtx = &seg->s_pmtx;
 692                         } else {
 693                                 pheadp = &amp->a_phead;
 694                                 pmtx = &amp->a_pmtx;
 695                         }
 696                         if (!mutex_tryenter(pmtx)) {
 697                                 mutex_exit(&hp->p_hmutex);
 698                                 mutex_enter(pmtx);
 699                                 mutex_enter(&hp->p_hmutex);
 700                                 /*
 701                                  * If we don't find bigger shadow list on
 702                                  * second search (it may happen since we
 703                                  * dropped bucket lock) keep the entry that
 704                                  * matches our own shadow list.
 705                                  */
 706                                 keep = 1;
 707                                 goto again;
 708                         }
 709                 }
 710         }
 711         mutex_exit(&hp->p_hmutex);
 712         if (pmtx != NULL) {
 713                 mutex_exit(pmtx);
 714         }
 715 out:
 716         (*callback)(htag0, addr, len, pp, rw, 0);
 717         if (npages) {
 718                 mutex_enter(&seg_pmem_mtx);
 719                 ASSERT(seg_plocked >= npages);
 720                 seg_plocked -= npages;
 721                 if (!IS_PFLAGS_WIRED(flags)) {
 722                         ASSERT(seg_plocked_window >= npages);
 723                         seg_plocked_window -= npages;
 724                 }
 725                 mutex_exit(&seg_pmem_mtx);
 726         }
 727
 728 }
 729
 730 #ifdef DEBUG
 731 static uint32_t p_insert_chk_mtbf = 0;
 732 #endif
 733
 734 /*
 735  * The seg_pinsert_check() is used by segment drivers to predict whether
 736  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
 737  */
 738 /*ARGSUSED*/
 739 int
 740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
 741     size_t len, uint_t flags)
 742 {
 743         ASSERT(seg != NULL);
 744
 745 #ifdef DEBUG
 746         if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
 747                 return (SEGP_FAIL);
 748         }
 749 #endif
 750
 751         if (seg_pdisabled) {
 752                 return (SEGP_FAIL);
 753         }
 754         ASSERT(seg_phashsize_win != 0);
 755
 756         if (IS_PFLAGS_WIRED(flags)) {
 757                 return (SEGP_SUCCESS);
 758         }
 759
 760         if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
 761                 return (SEGP_FAIL);
 762         }
 763
 764         if (freemem < desfree) {
 765                 return (SEGP_FAIL);
 766         }
 767
 768         return (SEGP_SUCCESS);
 769 }
 770
 771 #ifdef DEBUG
 772 static uint32_t p_insert_mtbf = 0;
 773 #endif
 774
 775 /*
 776  * Insert address range with shadow list into pagelock cache if there's no
 777  * shadow list already cached for this address range. If the cache is off or
 778  * caching is temporarily disabled or the allowed 'window' is exceeded return
 779  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
 780  *
 781  * For non wired shadow lists (segvn case) include address in the hashing
 782  * function to avoid linking all the entries from the same segment or amp on
 783  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
 784  * pcache entries are also linked on a per segment/amp list so that all
 785  * entries can be found quickly during seg/amp purge without walking the
 786  * entire pcache hash table.  For wired shadow lists (segspt case) we
 787  * don't use address hashing and per segment linking because the caller
 788  * currently inserts only one entry per segment that covers the entire
 789  * segment. If we used per segment linking even for segspt it would complicate
 790  * seg_ppurge_wiredpp() locking.
 791  *
 792  * Both hash bucket and per seg/amp locks need to be held before adding a non
 793  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
 794  * first.
 795  *
 796  * This function will also remove from pcache old inactive shadow lists that
 797  * overlap with this request but cover smaller range for the same start
 798  * address.
 799  */
 800 int
 801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
 802     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
 803     seg_preclaim_cbfunc_t callback)
 804 {
 805         struct seg_pcache *pcp;
 806         struct seg_phash *hp;
 807         pgcnt_t npages;
 808         pcache_link_t *pheadp;
 809         kmutex_t *pmtx;
 810         struct seg_pcache *delcallb_list = NULL;
 811
 812         ASSERT(seg != NULL);
 813         ASSERT(rw == S_READ || rw == S_WRITE);
 814         ASSERT(rw == S_READ || wlen == len);
 815         ASSERT(rw == S_WRITE || wlen <= len);
 816         ASSERT(amp == NULL || wlen == len);
 817
 818 #ifdef DEBUG
 819         if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
 820                 return (SEGP_FAIL);
 821         }
 822 #endif
 823
 824         if (seg_pdisabled) {
 825                 return (SEGP_FAIL);
 826         }
 827         ASSERT(seg_phashsize_win != 0);
 828
 829         ASSERT((len & PAGEOFFSET) == 0);
 830         npages = btop(len);
 831         mutex_enter(&seg_pmem_mtx);
 832         if (!IS_PFLAGS_WIRED(flags)) {
 833                 if (seg_plocked_window + npages > seg_pmaxwindow) {
 834                         mutex_exit(&seg_pmem_mtx);
 835                         return (SEGP_FAIL);
 836                 }
 837                 seg_plocked_window += npages;
 838         }
 839         seg_plocked += npages;
 840         mutex_exit(&seg_pmem_mtx);
 841
 842         pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
 843         /*
 844          * If amp is not NULL set htag0 to amp otherwise set it to seg.
 845          */
 846         if (amp == NULL) {
 847                 pcp->p_htag0 = (void *)seg;
 848                 pcp->p_flags = flags & 0xffff;
 849         } else {
 850                 pcp->p_htag0 = (void *)amp;
 851                 pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
 852         }
 853         pcp->p_addr = addr;
 854         pcp->p_len = len;
 855         pcp->p_wlen = wlen;
 856         pcp->p_pp = pp;
 857         pcp->p_write = (rw == S_WRITE);
 858         pcp->p_callback = callback;
 859         pcp->p_active = 1;
 860
 861         hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
 862         if (!IS_PFLAGS_WIRED(flags)) {
 863                 int found;
 864                 void *htag0;
 865                 if (amp == NULL) {
 866                         pheadp = &seg->s_phead;
 867                         pmtx = &seg->s_pmtx;
 868                         htag0 = (void *)seg;
 869                 } else {
 870                         pheadp = &amp->a_phead;
 871                         pmtx = &amp->a_pmtx;
 872                         htag0 = (void *)amp;
 873                 }
 874                 mutex_enter(pmtx);
 875                 mutex_enter(&hp->p_hmutex);
 876                 delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
 877                     len, &found);
 878                 if (found) {
 879                         mutex_exit(&hp->p_hmutex);
 880                         mutex_exit(pmtx);
 881                         mutex_enter(&seg_pmem_mtx);
 882                         seg_plocked -= npages;
 883                         seg_plocked_window -= npages;
 884                         mutex_exit(&seg_pmem_mtx);
 885                         kmem_cache_free(seg_pkmcache, pcp);
 886                         goto out;
 887                 }
 888                 pcp->p_plink.p_lnext = pheadp->p_lnext;
 889                 pcp->p_plink.p_lprev = pheadp;
 890                 pheadp->p_lnext->p_lprev = &pcp->p_plink;
 891                 pheadp->p_lnext = &pcp->p_plink;
 892         } else {
 893                 mutex_enter(&hp->p_hmutex);
 894         }
 895         pcp->p_hashp = hp;
 896         pcp->p_hnext = hp->p_hnext;
 897         pcp->p_hprev = (struct seg_pcache *)hp;
 898         hp->p_hnext->p_hprev = pcp;
 899         hp->p_hnext = pcp;
 900         if (!IS_PFLAGS_WIRED(flags) &&
 901             hp->p_hprev == pcp) {
 902                 seg_padd_abuck(hp);
 903         }
 904         mutex_exit(&hp->p_hmutex);
 905         if (!IS_PFLAGS_WIRED(flags)) {
 906                 mutex_exit(pmtx);
 907         }
 908
 909 out:
 910         npages = 0;
 911         while (delcallb_list != NULL) {
 912                 pcp = delcallb_list;
 913                 delcallb_list = pcp->p_hprev;
 914                 ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
 915                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
 916                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
 917                 npages += btop(pcp->p_len);
 918                 kmem_cache_free(seg_pkmcache, pcp);
 919         }
 920         if (npages) {
 921                 ASSERT(!IS_PFLAGS_WIRED(flags));
 922                 mutex_enter(&seg_pmem_mtx);
 923                 ASSERT(seg_plocked >= npages);
 924                 ASSERT(seg_plocked_window >= npages);
 925                 seg_plocked -= npages;
 926                 seg_plocked_window -= npages;
 927                 mutex_exit(&seg_pmem_mtx);
 928         }
 929
 930         return (SEGP_SUCCESS);
 931 }
 932
 933 /*
 934  * purge entries from the pagelock cache if not active
 935  * and not recently used.
 936  */
 937 static void
 938 seg_ppurge_async(int force)
 939 {
 940         struct seg_pcache *delcallb_list = NULL;
 941         struct seg_pcache *pcp;
 942         struct seg_phash *hp;
 943         pgcnt_t npages = 0;
 944         pgcnt_t npages_window = 0;
 945         pgcnt_t npgs_to_purge;
 946         pgcnt_t npgs_purged = 0;
 947         int hlinks = 0;
 948         int hlix;
 949         pcache_link_t *hlinkp;
 950         pcache_link_t *hlnextp = NULL;
 951         int lowmem;
 952         int trim;
 953
 954         ASSERT(seg_phashsize_win != 0);
 955
 956         /*
 957          * if the cache is off or empty, return
 958          */
 959         if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
 960                 return;
 961         }
 962
 963         if (!force) {
 964                 lowmem = 0;
 965                 trim = 0;
 966                 if (freemem < lotsfree + needfree) {
 967                         spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
 968                         if (fmem <= 5 * (desfree >> 2)) {
 969                                 lowmem = 1;
 970                         } else if (fmem <= 7 * (lotsfree >> 3)) {
 971                                 if (seg_plocked_window >=
 972                                     (availrmem_initial >> 1)) {
 973                                         lowmem = 1;
 974                                 }
 975                         } else if (fmem < lotsfree) {
 976                                 if (seg_plocked_window >=
 977                                     3 * (availrmem_initial >> 2)) {
 978                                         lowmem = 1;
 979                                 }
 980                         }
 981                 }
 982                 if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
 983                         trim = 1;
 984                 }
 985                 if (!lowmem && !trim) {
 986                         return;
 987                 }
 988                 npgs_to_purge = seg_plocked_window >>
 989                     seg_pshrink_shift;
 990                 if (lowmem) {
 991                         npgs_to_purge = MIN(npgs_to_purge,
 992                             MAX(seg_pmaxapurge_npages, desfree));
 993                 } else {
 994                         npgs_to_purge = MIN(npgs_to_purge,
 995                             seg_pmaxapurge_npages);
 996                 }
 997                 if (npgs_to_purge == 0) {
 998                         return;
 999                 }
1000         } else {
1001                 struct seg_phash_wired *hpw;
1002
1003                 ASSERT(seg_phashsize_wired != 0);
1004
1005                 for (hpw = seg_phashtab_wired;
1006                     hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1007
1008                         if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1009                                 continue;
1010                         }
1011
1012                         mutex_enter(&hpw->p_hmutex);
1013
1014                         for (pcp = hpw->p_hnext;
1015                             pcp != (struct seg_pcache *)hpw;
1016                             pcp = pcp->p_hnext) {
1017
1018                                 ASSERT(IS_PCP_WIRED(pcp));
1019                                 ASSERT(pcp->p_hashp ==
1020                                     (struct seg_phash *)hpw);
1021
1022                                 if (pcp->p_active) {
1023                                         continue;
1024                                 }
1025                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1026                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1027                                 pcp->p_hprev = delcallb_list;
1028                                 delcallb_list = pcp;
1029                         }
1030                         mutex_exit(&hpw->p_hmutex);
1031                 }
1032         }
1033
1034         mutex_enter(&seg_pmem_mtx);
1035         if (seg_pathr_on) {
1036                 mutex_exit(&seg_pmem_mtx);
1037                 goto runcb;
1038         }
1039         seg_pathr_on = 1;
1040         mutex_exit(&seg_pmem_mtx);
1041         ASSERT(seg_pahcur <= 1);
1042         hlix = !seg_pahcur;
1043
1044 again:
1045         for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1046             hlinkp = hlnextp) {
1047
1048                 hlnextp = hlinkp->p_lnext;
1049                 ASSERT(hlnextp != NULL);
1050
1051                 hp = hlink2phash(hlinkp, hlix);
1052                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1053                         seg_pathr_empty_ahb++;
1054                         continue;
1055                 }
1056                 seg_pathr_full_ahb++;
1057                 mutex_enter(&hp->p_hmutex);
1058
1059                 for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1060                     pcp = pcp->p_hnext) {
1061                         pcache_link_t *pheadp;
1062                         pcache_link_t *plinkp;
1063                         void *htag0;
1064                         kmutex_t *pmtx;
1065
1066                         ASSERT(!IS_PCP_WIRED(pcp));
1067                         ASSERT(pcp->p_hashp == hp);
1068
1069                         if (pcp->p_active) {
1070                                 continue;
1071                         }
1072                         if (!force && pcp->p_ref &&
1073                             PCP_AGE(pcp) < seg_pmax_pcpage) {
1074                                 pcp->p_ref = 0;
1075                                 continue;
1076                         }
1077                         plinkp = &pcp->p_plink;
1078                         htag0 = pcp->p_htag0;
1079                         if (pcp->p_flags & SEGP_AMP) {
1080                                 pheadp = &((amp_t *)htag0)->a_phead;
1081                                 pmtx = &((amp_t *)htag0)->a_pmtx;
1082                         } else {
1083                                 pheadp = &((seg_t *)htag0)->s_phead;
1084                                 pmtx = &((seg_t *)htag0)->s_pmtx;
1085                         }
1086                         if (!mutex_tryenter(pmtx)) {
1087                                 continue;
1088                         }
1089                         ASSERT(pheadp->p_lnext != pheadp);
1090                         ASSERT(pheadp->p_lprev != pheadp);
1091                         plinkp->p_lprev->p_lnext =
1092                             plinkp->p_lnext;
1093                         plinkp->p_lnext->p_lprev =
1094                             plinkp->p_lprev;
1095                         pcp->p_hprev->p_hnext = pcp->p_hnext;
1096                         pcp->p_hnext->p_hprev = pcp->p_hprev;
1097                         mutex_exit(pmtx);
1098                         pcp->p_hprev = delcallb_list;
1099                         delcallb_list = pcp;
1100                         npgs_purged += btop(pcp->p_len);
1101                 }
1102                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1103                         seg_premove_abuck(hp, 1);
1104                 }
1105                 mutex_exit(&hp->p_hmutex);
1106                 if (npgs_purged >= seg_plocked_window) {
1107                         break;
1108                 }
1109                 if (!force) {
1110                         if (npgs_purged >= npgs_to_purge) {
1111                                 break;
1112                         }
1113                         if (!trim && !(seg_pathr_full_ahb & 15)) {
1114                                 ASSERT(lowmem);
1115                                 if (freemem >= lotsfree + needfree) {
1116                                         break;
1117                                 }
1118                         }
1119                 }
1120         }
1121
1122         if (hlinkp == &seg_pahhead[hlix]) {
1123                 /*
1124                  * We processed the entire hlix active bucket list
1125                  * but didn't find enough pages to reclaim.
1126                  * Switch the lists and walk the other list
1127                  * if we haven't done it yet.
1128                  */
1129                 mutex_enter(&seg_pmem_mtx);
1130                 ASSERT(seg_pathr_on);
1131                 ASSERT(seg_pahcur == !hlix);
1132                 seg_pahcur = hlix;
1133                 mutex_exit(&seg_pmem_mtx);
1134                 if (++hlinks < 2) {
1135                         hlix = !hlix;
1136                         goto again;
1137                 }
1138         } else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1139             seg_pahhead[hlix].p_lnext != hlinkp) {
1140                 ASSERT(hlinkp != NULL);
1141                 ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1142                 ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1143                 ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1144
1145                 /*
1146                  * Reinsert the header to point to hlinkp
1147                  * so that we start from hlinkp bucket next time around.
1148                  */
1149                 seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1150                 seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1151                 seg_pahhead[hlix].p_lnext = hlinkp;
1152                 seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1153                 hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1154                 hlinkp->p_lprev = &seg_pahhead[hlix];
1155         }
1156
1157         mutex_enter(&seg_pmem_mtx);
1158         ASSERT(seg_pathr_on);
1159         seg_pathr_on = 0;
1160         mutex_exit(&seg_pmem_mtx);
1161
1162 runcb:
1163         /*
1164          * Run the delayed callback list. segments/amps can't go away until
1165          * callback is executed since they must have non 0 softlockcnt. That's
1166          * why we don't need to hold as/seg/amp locks to execute the callback.
1167          */
1168         while (delcallb_list != NULL) {
1169                 pcp = delcallb_list;
1170                 delcallb_list = pcp->p_hprev;
1171                 ASSERT(!pcp->p_active);
1172                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1173                     pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1174                 npages += btop(pcp->p_len);
1175                 if (!IS_PCP_WIRED(pcp)) {
1176                         npages_window += btop(pcp->p_len);
1177                 }
1178                 kmem_cache_free(seg_pkmcache, pcp);
1179         }
1180         if (npages) {
1181                 mutex_enter(&seg_pmem_mtx);
1182                 ASSERT(seg_plocked >= npages);
1183                 ASSERT(seg_plocked_window >= npages_window);
1184                 seg_plocked -= npages;
1185                 seg_plocked_window -= npages_window;
1186                 mutex_exit(&seg_pmem_mtx);
1187         }
1188 }
1189
1190 /*
1191  * Remove cached pages for segment(s) entries from hashtable.  The segments
1192  * are identified by pp array. This is useful for multiple seg's cached on
1193  * behalf of dummy segment (ISM/DISM) with common pp array.
1194  */
1195 void
1196 seg_ppurge_wiredpp(struct page **pp)
1197 {
1198         struct seg_pcache *pcp;
1199         struct seg_phash_wired *hp;
1200         pgcnt_t npages = 0;
1201         struct  seg_pcache *delcallb_list = NULL;
1202
1203         /*
1204          * if the cache is empty, return
1205          */
1206         if (seg_plocked == 0) {
1207                 return;
1208         }
1209         ASSERT(seg_phashsize_wired != 0);
1210
1211         for (hp = seg_phashtab_wired;
1212             hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1213                 if (hp->p_hnext == (struct seg_pcache *)hp) {
1214                         continue;
1215                 }
1216                 mutex_enter(&hp->p_hmutex);
1217                 pcp = hp->p_hnext;
1218                 while (pcp != (struct seg_pcache *)hp) {
1219                         ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1220                         ASSERT(IS_PCP_WIRED(pcp));
1221                         /*
1222                          * purge entries which are not active
1223                          */
1224                         if (!pcp->p_active && pcp->p_pp == pp) {
1225                                 ASSERT(pcp->p_htag0 != NULL);
1226                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1227                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1228                                 pcp->p_hprev = delcallb_list;
1229                                 delcallb_list = pcp;
1230                         }
1231                         pcp = pcp->p_hnext;
1232                 }
1233                 mutex_exit(&hp->p_hmutex);
1234                 /*
1235                  * segments can't go away until callback is executed since
1236                  * they must have non 0 softlockcnt. That's why we don't
1237                  * need to hold as/seg locks to execute the callback.
1238                  */
1239                 while (delcallb_list != NULL) {
1240                         int done;
1241                         pcp = delcallb_list;
1242                         delcallb_list = pcp->p_hprev;
1243                         ASSERT(!pcp->p_active);
1244                         done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1245                             pcp->p_len, pcp->p_pp,
1246                             pcp->p_write ? S_WRITE : S_READ, 1);
1247                         npages += btop(pcp->p_len);
1248                         ASSERT(IS_PCP_WIRED(pcp));
1249                         kmem_cache_free(seg_pkmcache, pcp);
1250                         if (done) {
1251                                 ASSERT(delcallb_list == NULL);
1252                                 goto out;
1253                         }
1254                 }
1255         }
1256
1257 out:
1258         mutex_enter(&seg_pmem_mtx);
1259         ASSERT(seg_plocked >= npages);
1260         seg_plocked -= npages;
1261         mutex_exit(&seg_pmem_mtx);
1262 }
1263
1264 /*
1265  * purge all entries for a given segment. Since we
1266  * callback into the segment driver directly for page
1267  * reclaim the caller needs to hold the right locks.
1268  */
1269 void
1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1271 {
1272         struct seg_pcache *delcallb_list = NULL;
1273         struct seg_pcache *pcp;
1274         struct seg_phash *hp;
1275         pgcnt_t npages = 0;
1276         void *htag0;
1277
1278         if (seg_plocked == 0) {
1279                 return;
1280         }
1281         ASSERT(seg_phashsize_win != 0);
1282
1283         /*
1284          * If amp is not NULL use amp as a lookup tag otherwise use seg
1285          * as a lookup tag.
1286          */
1287         htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288         ASSERT(htag0 != NULL);
1289         if (IS_PFLAGS_WIRED(flags)) {
1290                 hp = P_HASHBP(seg, htag0, 0, flags);
1291                 mutex_enter(&hp->p_hmutex);
1292                 pcp = hp->p_hnext;
1293                 while (pcp != (struct seg_pcache *)hp) {
1294                         ASSERT(pcp->p_hashp == hp);
1295                         ASSERT(IS_PCP_WIRED(pcp));
1296                         if (pcp->p_htag0 == htag0) {
1297                                 if (pcp->p_active) {
1298                                         break;
1299                                 }
1300                                 pcp->p_hprev->p_hnext = pcp->p_hnext;
1301                                 pcp->p_hnext->p_hprev = pcp->p_hprev;
1302                                 pcp->p_hprev = delcallb_list;
1303                                 delcallb_list = pcp;
1304                         }
1305                         pcp = pcp->p_hnext;
1306                 }
1307                 mutex_exit(&hp->p_hmutex);
1308         } else {
1309                 pcache_link_t *plinkp;
1310                 pcache_link_t *pheadp;
1311                 kmutex_t *pmtx;
1312
1313                 if (amp == NULL) {
1314                         ASSERT(seg != NULL);
1315                         pheadp = &seg->s_phead;
1316                         pmtx = &seg->s_pmtx;
1317                 } else {
1318                         pheadp = &amp->a_phead;
1319                         pmtx = &amp->a_pmtx;
1320                 }
1321                 mutex_enter(pmtx);
1322                 while ((plinkp = pheadp->p_lnext) != pheadp) {
1323                         pcp = plink2pcache(plinkp);
1324                         ASSERT(!IS_PCP_WIRED(pcp));
1325                         ASSERT(pcp->p_htag0 == htag0);
1326                         hp = pcp->p_hashp;
1327                         mutex_enter(&hp->p_hmutex);
1328                         if (pcp->p_active) {
1329                                 mutex_exit(&hp->p_hmutex);
1330                                 break;
1331                         }
1332                         ASSERT(plinkp->p_lprev == pheadp);
1333                         pheadp->p_lnext = plinkp->p_lnext;
1334                         plinkp->p_lnext->p_lprev = pheadp;
1335                         pcp->p_hprev->p_hnext = pcp->p_hnext;
1336                         pcp->p_hnext->p_hprev = pcp->p_hprev;
1337                         pcp->p_hprev = delcallb_list;
1338                         delcallb_list = pcp;
1339                         if (hp->p_hnext == (struct seg_pcache *)hp) {
1340                                 seg_premove_abuck(hp, 0);
1341                         }
1342                         mutex_exit(&hp->p_hmutex);
1343                 }
1344                 mutex_exit(pmtx);
1345         }
1346         while (delcallb_list != NULL) {
1347                 pcp = delcallb_list;
1348                 delcallb_list = pcp->p_hprev;
1349                 ASSERT(!pcp->p_active);
1350                 (void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351                     pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352                 npages += btop(pcp->p_len);
1353                 kmem_cache_free(seg_pkmcache, pcp);
1354         }
1355         mutex_enter(&seg_pmem_mtx);
1356         ASSERT(seg_plocked >= npages);
1357         seg_plocked -= npages;
1358         if (!IS_PFLAGS_WIRED(flags)) {
1359                 ASSERT(seg_plocked_window >= npages);
1360                 seg_plocked_window -= npages;
1361         }
1362         mutex_exit(&seg_pmem_mtx);
1363 }
1364
1365 static void seg_pinit_mem_config(void);
1366
1367 /*
1368  * setup the pagelock cache
1369  */
1370 static void
1371 seg_pinit(void)
1372 {
1373         struct seg_phash *hp;
1374         ulong_t i;
1375         pgcnt_t physmegs;
1376
1377         seg_plocked = 0;
1378         seg_plocked_window = 0;
1379
1380         if (segpcache_enabled == 0) {
1381                 seg_phashsize_win = 0;
1382                 seg_phashsize_wired = 0;
1383                 seg_pdisabled = 1;
1384                 return;
1385         }
1386
1387         seg_pdisabled = 0;
1388         seg_pkmcache = kmem_cache_create("seg_pcache",
1389             sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390         if (segpcache_pcp_maxage_ticks <= 0) {
1391                 segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392         }
1393         seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394         seg_pathr_empty_ahb = 0;
1395         seg_pathr_full_ahb = 0;
1396         seg_pshrink_shift = segpcache_shrink_shift;
1397         seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1398
1399         mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400         mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401         mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402         cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403
1404         physmegs = physmem >> (20 - PAGESHIFT);
1405
1406         /*
1407          * If segpcache_hashsize_win was not set in /etc/system or it has
1408          * absurd value set it to a default.
1409          */
1410         if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411                 /*
1412                  * Create one bucket per 32K (or at least per 8 pages) of
1413                  * available memory.
1414                  */
1415                 pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416                 segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417         }
1418         if (!ISP2(segpcache_hashsize_win)) {
1419                 ulong_t rndfac = ~(1UL <<
1420                     (highbit(segpcache_hashsize_win) - 1));
1421                 rndfac &= segpcache_hashsize_win;
1422                 segpcache_hashsize_win += rndfac;
1423                 segpcache_hashsize_win = 1 <<
1424                     (highbit(segpcache_hashsize_win) - 1);
1425         }
1426         seg_phashsize_win = segpcache_hashsize_win;
1427         seg_phashtab_win = kmem_zalloc(
1428             seg_phashsize_win * sizeof (struct seg_phash),
1429             KM_SLEEP);
1430         for (i = 0; i < seg_phashsize_win; i++) {
1431                 hp = &seg_phashtab_win[i];
1432                 hp->p_hnext = (struct seg_pcache *)hp;
1433                 hp->p_hprev = (struct seg_pcache *)hp;
1434                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435         }
1436
1437         seg_pahcur = 0;
1438         seg_pathr_on = 0;
1439         seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440         seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441         seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442         seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443
1444         /*
1445          * If segpcache_hashsize_wired was not set in /etc/system or it has
1446          * absurd value set it to a default.
1447          */
1448         if (segpcache_hashsize_wired == 0 ||
1449             segpcache_hashsize_wired > physmem / 4) {
1450                 /*
1451                  * Choose segpcache_hashsize_wired based on physmem.
1452                  * Create a bucket per 128K bytes upto 256K buckets.
1453                  */
1454                 if (physmegs < 20 * 1024) {
1455                         segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1456                 } else {
1457                         segpcache_hashsize_wired = 256 * 1024;
1458                 }
1459         }
1460         if (!ISP2(segpcache_hashsize_wired)) {
1461                 segpcache_hashsize_wired = 1 <<
1462                     highbit(segpcache_hashsize_wired);
1463         }
1464         seg_phashsize_wired = segpcache_hashsize_wired;
1465         seg_phashtab_wired = kmem_zalloc(
1466             seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1467         for (i = 0; i < seg_phashsize_wired; i++) {
1468                 hp = (struct seg_phash *)&seg_phashtab_wired[i];
1469                 hp->p_hnext = (struct seg_pcache *)hp;
1470                 hp->p_hprev = (struct seg_pcache *)hp;
1471                 mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1472         }
1473
1474         if (segpcache_maxwindow == 0) {
1475                 if (physmegs < 64) {
1476                         /* 3% of memory */
1477                         segpcache_maxwindow = availrmem >> 5;
1478                 } else if (physmegs < 512) {
1479                         /* 12% of memory */
1480                         segpcache_maxwindow = availrmem >> 3;
1481                 } else if (physmegs < 1024) {
1482                         /* 25% of memory */
1483                         segpcache_maxwindow = availrmem >> 2;
1484                 } else if (physmegs < 2048) {
1485                         /* 50% of memory */
1486                         segpcache_maxwindow = availrmem >> 1;
1487                 } else {
1488                         /* no limit */
1489                         segpcache_maxwindow = (pgcnt_t)-1;
1490                 }
1491         }
1492         seg_pmaxwindow = segpcache_maxwindow;
1493         seg_pinit_mem_config();
1494 }
1495
1496 /*
1497  * called by pageout if memory is low
1498  */
1499 void
1500 seg_preap(void)
1501 {
1502         /*
1503          * if the cache is off or empty, return
1504          */
1505         if (seg_plocked_window == 0) {
1506                 return;
1507         }
1508         ASSERT(seg_phashsize_win != 0);
1509
1510         /*
1511          * If somebody is already purging pcache
1512          * just return.
1513          */
1514         if (seg_pdisabled) {
1515                 return;
1516         }
1517
1518         cv_signal(&seg_pasync_cv);
1519 }
1520
1521 /*
1522  * run as a backgroud thread and reclaim pagelock
1523  * pages which have not been used recently
1524  */
1525 void
1526 seg_pasync_thread(void)
1527 {
1528         callb_cpr_t cpr_info;
1529
1530         if (seg_phashsize_win == 0) {
1531                 thread_exit();
1532                 /*NOTREACHED*/
1533         }
1534
1535         seg_pasync_thr = curthread;
1536
1537         CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538             callb_generic_cpr, "seg_pasync");
1539
1540         if (segpcache_reap_ticks <= 0) {
1541                 segpcache_reap_ticks = segpcache_reap_sec * hz;
1542         }
1543
1544         mutex_enter(&seg_pasync_mtx);
1545         for (;;) {
1546                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547                 (void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548                     segpcache_reap_ticks, TR_CLOCK_TICK);
1549                 CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550                 if (seg_pdisabled == 0) {
1551                         seg_ppurge_async(0);
1552                 }
1553         }
1554 }
1555
/*
 * kmem cache for struct seg.  Created in seg_init(); seg_alloc() allocates
 * from it and seg_free() returns segments to it.
 */
static struct kmem_cache *seg_cache;
1557
1558 /*
1559  * Initialize segment management data structures.
1560  */
1561 void
1562 seg_init(void)
1563 {
1564         kstat_t *ksp;
1565
1566         seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567             0, NULL, NULL, NULL, NULL, NULL, 0);
1568
1569         ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1570             segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1571         if (ksp) {
1572                 ksp->ks_data = (void *)segadvstat_ptr;
1573                 kstat_install(ksp);
1574         }
1575
1576         seg_pinit();
1577 }
1578
1579 /*
1580  * Allocate a segment to cover [base, base+size]
1581  * and attach it to the specified address space.
1582  */
1583 struct seg *
1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1585 {
1586         struct seg *new;
1587         caddr_t segbase;
1588         size_t segsize;
1589
1590         segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1591         segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1592             (uintptr_t)segbase;
1593
1594         if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1595                 return (NULL);  /* bad virtual addr range */
1596
1597         if (as != &kas &&
1598             valid_usr_range(segbase, segsize, 0, as,
1599             as->a_userlimit) != RANGE_OKAY)
1600                 return (NULL);  /* bad virtual addr range */
1601
1602         new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1603         new->s_ops = NULL;
1604         new->s_data = NULL;
1605         new->s_szc = 0;
1606         new->s_flags = 0;
1607         mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608         new->s_phead.p_lnext = &new->s_phead;
1609         new->s_phead.p_lprev = &new->s_phead;
1610         if (seg_attach(as, segbase, segsize, new) < 0) {
1611                 kmem_cache_free(seg_cache, new);
1612                 return (NULL);
1613         }
1614         /* caller must fill in ops, data */
1615         return (new);
1616 }
1617
1618 /*
1619  * Attach a segment to the address space.  Used by seg_alloc()
1620  * and for kernel startup to attach to static segments.
1621  */
1622 int
1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1624 {
1625         seg->s_as = as;
1626         seg->s_base = base;
1627         seg->s_size = size;
1628
1629         /*
1630          * as_addseg() will add the segment at the appropraite point
1631          * in the list. It will return -1 if there is overlap with
1632          * an already existing segment.
1633          */
1634         return (as_addseg(as, seg));
1635 }
1636
1637 /*
1638  * Unmap a segment and free it from its associated address space.
1639  * This should be called by anybody who's finished with a whole segment's
1640  * mapping.  Just calls segop_unmap() on the whole mapping .  It is the
1641  * responsibility of the segment driver to unlink the the segment
1642  * from the address space, and to free public and private data structures
1643  * associated with the segment.  (This is typically done by a call to
1644  * seg_free()).
1645  */
1646 void
1647 seg_unmap(struct seg *seg)
1648 {
1649 #ifdef DEBUG
1650         int ret;
1651 #endif /* DEBUG */
1652
1653         ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1654
1655         /* Shouldn't have called seg_unmap if mapping isn't yet established */
1656         ASSERT(seg->s_data != NULL);
1657
1658         /* Unmap the whole mapping */
1659 #ifdef DEBUG
1660         ret = segop_unmap(seg, seg->s_base, seg->s_size);
1661         ASSERT(ret == 0);
1662 #else
1663         (void) segop_unmap(seg, seg->s_base, seg->s_size);
1664 #endif /* DEBUG */
1665 }
1666
1667 /*
1668  * Free the segment from its associated as. This should only be called
1669  * if a mapping to the segment has not yet been established (e.g., if
1670  * an error occurs in the middle of doing an as_map when the segment
1671  * has already been partially set up) or if it has already been deleted
1672  * (e.g., from a segment driver unmap routine if the unmap applies to the
1673  * entire segment). If the mapping is currently set up then seg_unmap() should
1674  * be called instead.
1675  */
1676 void
1677 seg_free(struct seg *seg)
1678 {
1679         register struct as *as = seg->s_as;
1680         struct seg *tseg = as_removeseg(as, seg);
1681
1682         ASSERT(tseg == seg);
1683
1684         /*
1685          * If the segment private data field is NULL,
1686          * then segment driver is not attached yet.
1687          */
1688         if (seg->s_data != NULL)
1689                 segop_free(seg);
1690
1691         mutex_destroy(&seg->s_pmtx);
1692         ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1693         ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1694         kmem_cache_free(seg_cache, seg);
1695 }
1696
1697 /*ARGSUSED*/
1698 static void
1699 seg_p_mem_config_post_add(
1700         void *arg,
1701         pgcnt_t delta_pages)
1702 {
1703         /* Nothing to do. */
1704 }
1705
1706 void
1707 seg_p_enable(void)
1708 {
1709         mutex_enter(&seg_pcache_mtx);
1710         ASSERT(seg_pdisabled != 0);
1711         seg_pdisabled--;
1712         mutex_exit(&seg_pcache_mtx);
1713 }
1714
1715 /*
1716  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1717  * cache.
1718  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719  * SEGP_FAIL if the cache could not be emptied.
1720  */
1721 int
1722 seg_p_disable(void)
1723 {
1724         pgcnt_t old_plocked;
1725         int stall_count = 0;
1726
1727         mutex_enter(&seg_pcache_mtx);
1728         seg_pdisabled++;
1729         ASSERT(seg_pdisabled != 0);
1730         mutex_exit(&seg_pcache_mtx);
1731
1732         /*
1733          * Attempt to empty the cache. Terminate if seg_plocked does not
1734          * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1735          */
1736         while (seg_plocked != 0) {
1737                 ASSERT(seg_phashsize_win != 0);
1738                 old_plocked = seg_plocked;
1739                 seg_ppurge_async(1);
1740                 if (seg_plocked == old_plocked) {
1741                         if (stall_count++ > SEGP_STALL_THRESHOLD) {
1742                                 return (SEGP_FAIL);
1743                         }
1744                 } else
1745                         stall_count = 0;
1746                 if (seg_plocked != 0)
1747                         delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1748         }
1749         return (SEGP_SUCCESS);
1750 }
1751
1752 /*
1753  * Attempt to purge seg_pcache.  May need to return before this has
1754  * completed to allow other pre_del callbacks to unlock pages. This is
1755  * ok because:
1756  *      1) The seg_pdisabled flag has been set so at least we won't
1757  *      cache anymore locks and the locks we couldn't purge
1758  *      will not be held if they do get released by a subsequent
1759  *      pre-delete callback.
1760  *
1761  *      2) The rest of the memory delete thread processing does not
1762  *      depend on the changes made in this pre-delete callback. No
1763  *      panics will result, the worst that will happen is that the
1764  *      DR code will timeout and cancel the delete.
1765  */
1766 /*ARGSUSED*/
1767 static int
1768 seg_p_mem_config_pre_del(
1769         void *arg,
1770         pgcnt_t delta_pages)
1771 {
1772         if (seg_phashsize_win == 0) {
1773                 return (0);
1774         }
1775         if (seg_p_disable() != SEGP_SUCCESS)
1776                 cmn_err(CE_NOTE,
1777                     "!Pre-delete couldn't purge"" pagelock cache - continuing");
1778         return (0);
1779 }
1780
1781 /*ARGSUSED*/
1782 static void
1783 seg_p_mem_config_post_del(
1784         void *arg,
1785         pgcnt_t delta_pages,
1786         int cancelled)
1787 {
1788         if (seg_phashsize_win == 0) {
1789                 return;
1790         }
1791         seg_p_enable();
1792 }
1793
/*
 * Callback vector registered with kphysm_setup_func_register() by
 * seg_pinit_mem_config().
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
        KPHYSM_SETUP_VECTOR_VERSION,
        /* no-op */
        seg_p_mem_config_post_add,
        /* disables the pagelock cache and tries to purge it */
        seg_p_mem_config_pre_del,
        /* re-enables the pagelock cache */
        seg_p_mem_config_post_del,
};
1800
1801 static void
1802 seg_pinit_mem_config(void)
1803 {
1804         int ret;
1805
1806         ret = kphysm_setup_func_register(&seg_p_mem_config_vec, NULL);
1807         /*
1808          * Want to catch this in the debug kernel. At run time, if the
1809          * callbacks don't get run all will be OK as the disable just makes
1810          * it more likely that the pages can be collected.
1811          */
1812         ASSERT(ret == 0);
1813 }
1814
1815 /*
1816  * Verify that segment is not a shared anonymous segment which reserves
1817  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1818  * from one zone to another if any segments are shared.  This is because the
1819  * last process to exit will credit the swap reservation.  This could lead
1820  * to the swap being reserved by one zone, and credited to another.
1821  */
1822 boolean_t
1823 seg_can_change_zones(struct seg *seg)
1824 {
1825         struct segvn_data *svd;
1826
1827         if (seg->s_ops == &segspt_shmops)
1828                 return (B_FALSE);
1829
1830         if (seg->s_ops == &segvn_ops) {
1831                 svd = (struct segvn_data *)seg->s_data;
1832                 if (svd->type == MAP_SHARED &&
1833                     svd->amp != NULL &&
1834                     svd->amp->swresv > 0)
1835                         return (B_FALSE);
1836         }
1837         return (B_TRUE);
1838 }
1839
1840 /*
1841  * Return swap reserved by a segment backing a private mapping.
1842  */
1843 size_t
1844 seg_swresv(struct seg *seg)
1845 {
1846         struct segvn_data *svd;
1847         size_t swap = 0;
1848
1849         if (seg->s_ops == &segvn_ops) {
1850                 svd = (struct segvn_data *)seg->s_data;
1851                 if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1852                         swap = svd->swresv;
1853         }
1854         return (swap);
1855 }
1856
1857 /*
1858  * segop wrappers
1859  */
1860 int
1861 segop_dup(struct seg *seg, struct seg *new)
1862 {
1863         return (seg->s_ops->dup(seg, new));
1864 }
1865
1866 int
1867 segop_unmap(struct seg *seg, caddr_t addr, size_t len)
1868 {
1869         return (seg->s_ops->unmap(seg, addr, len));
1870 }
1871
1872 void
1873 segop_free(struct seg *seg)
1874 {
1875         seg->s_ops->free(seg);
1876 }
1877
1878 faultcode_t
1879 segop_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len,
1880     enum fault_type type, enum seg_rw rw)
1881 {
1882         return (seg->s_ops->fault(hat, seg, addr, len, type, rw));
1883 }
1884
1885 faultcode_t
1886 segop_faulta(struct seg *seg, caddr_t addr)
1887 {
1888         return (seg->s_ops->faulta(seg, addr));
1889 }
1890
1891 int
1892 segop_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1893 {
1894         return (seg->s_ops->setprot(seg, addr, len, prot));
1895 }
1896
1897 int
1898 segop_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1899 {
1900         return (seg->s_ops->checkprot(seg, addr, len, prot));
1901 }
1902
1903 int
1904 segop_kluster(struct seg *seg, caddr_t addr, ssize_t d)
1905 {
1906         return (seg->s_ops->kluster(seg, addr, d));
1907 }
1908
1909 int
1910 segop_sync(struct seg *seg, caddr_t addr, size_t len, int atr, uint_t f)
1911 {
1912         return (seg->s_ops->sync(seg, addr, len, atr, f));
1913 }
1914
1915 size_t
1916 segop_incore(struct seg *seg, caddr_t addr, size_t len, char *v)
1917 {
1918         return (seg->s_ops->incore(seg, addr, len, v));
1919 }
1920
1921 int
1922 segop_lockop(struct seg *seg, caddr_t addr, size_t len, int atr, int op,
1923     ulong_t *b, size_t p)
1924 {
1925         return (seg->s_ops->lockop(seg, addr, len, atr, op, b, p));
1926 }
1927
1928 int
1929 segop_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *p)
1930 {
1931         return (seg->s_ops->getprot(seg, addr, len, p));
1932 }
1933
1934 uoff_t
1935 segop_getoffset(struct seg *seg, caddr_t addr)
1936 {
1937         return (seg->s_ops->getoffset(seg, addr));
1938 }
1939
1940 int
1941 segop_gettype(struct seg *seg, caddr_t addr)
1942 {
1943         return (seg->s_ops->gettype(seg, addr));
1944 }
1945
1946 int
1947 segop_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
1948 {
1949         return (seg->s_ops->getvp(seg, addr, vpp));
1950 }
1951
1952 int
1953 segop_advise(struct seg *seg, caddr_t addr, size_t len, uint_t b)
1954 {
1955         return (seg->s_ops->advise(seg, addr, len, b));
1956 }
1957
1958 void
1959 segop_dump(struct seg *seg)
1960 {
1961         if (seg->s_ops->dump == NULL)
1962                 return;
1963
1964         seg->s_ops->dump(seg);
1965 }
1966
1967 int
1968 segop_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***page,
1969     enum lock_type type, enum seg_rw rw)
1970 {
1971         if (seg->s_ops->pagelock == NULL)
1972                 return (ENOTSUP);
1973
1974         return (seg->s_ops->pagelock(seg, addr, len, page, type, rw));
1975 }
1976
1977 int
1978 segop_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
1979 {
1980         if (seg->s_ops->setpagesize == NULL)
1981                 return (ENOTSUP);
1982
1983         return (seg->s_ops->setpagesize(seg, addr, len, szc));
1984 }
1985
1986 int
1987 segop_getmemid(struct seg *seg, caddr_t addr, memid_t *mp)
1988 {
1989         if (seg->s_ops->getmemid == NULL)
1990                 return (ENODEV);
1991
1992         return (seg->s_ops->getmemid(seg, addr, mp));
1993 }
1994
1995 struct lgrp_mem_policy_info *
1996 segop_getpolicy(struct seg *seg, caddr_t addr)
1997 {
1998         if (seg->s_ops->getpolicy == NULL)
1999                 return (NULL);
2000
2001         return (seg->s_ops->getpolicy(seg, addr));
2002 }
2003
2004 int
2005 segop_capable(struct seg *seg, segcapability_t cap)
2006 {
2007         if (seg->s_ops->capable == NULL)
2008                 return (0);
2009
2010         return (seg->s_ops->capable(seg, cap));
2011 }
2012
2013 int
2014 segop_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t op)
2015 {
2016         if (seg->s_ops->inherit == NULL)
2017                 return (ENOTSUP);
2018
2019         return (seg->s_ops->inherit(seg, addr, len, op));
2020 }