usr/src/uts/i86pc/os/pmem.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27
  28 /*
  29  * PMEM - Direct mapping physical memory pages to userland process
  30  *
  31  * Provide functions used for directly (w/o occupying kernel virtual address
  32  * space) allocating and exporting physical memory pages to userland.
  33  */
  34
  35 #include <sys/types.h>
  36 #include <sys/mutex.h>
  37 #include <sys/sunddi.h>
  38 #include <sys/ddidevmap.h>
  39 #include <sys/vnode.h>
  40 #include <sys/sysmacros.h>
  41 #include <vm/seg_dev.h>
  42 #include <sys/pmem.h>
  43 #include <vm/hat_i86.h>
  44 #include <sys/task.h>
  45 #include <sys/sdt.h>
  46
  47 /*
  48  * The routines in this file allocate memory which will be accessed through
  49  * the AGP GART hardware.  The GART is programmed with the PFNs for this
  50  * memory, and the only mechanism for removing these entries is by an
  51  * explicit process operation (ioctl/close of the driver, or process exit).
  52  * As such, the pages need to remain locked to ensure that they won't be
  53  * relocated or paged out.
  54  *
  55  * To prevent these locked pages from getting in the way of page
  56  * coalescing, we try to allocate large pages from the system, and carve
  57  * them up to satisfy pmem allocation requests.  This will keep the locked
  58  * pages within a constrained area of physical memory, limiting the number
  59  * of large pages that would be pinned by our locked pages.  This is, of
  60  * course, another take on the infamous kernel cage, and it has many of the
  61  * downsides of the original cage.  It also interferes with system-wide
  62  * resource management decisions, as it maintains its own pool of unused
  63  * pages which can't be easily reclaimed and used during low-memory
  64  * situations.
  65  *
  66  * The right solution is for pmem to register a callback that the VM system
  67  * could call, which would temporarily remove any GART entries for pages
  68  * that were being relocated.  This would let us leave the pages unlocked,
  69  * which would remove the need for using large pages, which would simplify
  70  * this code a great deal.  Unfortunately, the support for these callbacks
  71  * only exists on some SPARC platforms right now.
  72  *
  73  * Note that this is the *only* reason that large pages are used here.  The
  74  * GART can't perform large-page translations, and the code appropriately
  75  * falls back to using small pages if page_create_va_large() fails.
  76  */
  77
  78 #define HOLD_DHP_LOCK(dhp)  if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
  79                         { mutex_enter(&dhp->dh_lock); }
  80
  81 #define RELE_DHP_LOCK(dhp) if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) \
  82                         { mutex_exit(&dhp->dh_lock); }
  83
  84 #define FROM_LPG(pp) (pp->p_szc != 0)
  85 #define PFIND(pp) (page_pptonum(pp) & (pmem_pgcnt - 1))
  86
  87 /*
  88  * Structs and static variables used for pmem only.
  89  */
  90 typedef struct pmem_lpg {
  91         page_t  *pl_pp;         /* start pp */
  92         ulong_t *pl_bitmap;     /* allocation status for each page */
  93         ushort_t pl_pfree;      /* this large page might be fully freed */
  94         struct pmem_lpg *pl_next;
  95         struct pmem_lpg *pl_prev;
  96 } pmem_lpg_t;
  97
  98 static size_t   pmem_lpgsize;   /* the size of one large page */
  99 static pgcnt_t  pmem_pgcnt;     /* the number of small pages in a large page */
 100 static uint_t   pmem_lszc;      /* page size code of the large page */
 101 /* The segment to be associated with all the allocated pages. */
 102 static struct seg       pmem_seg;
 103 /* Fully occupied large pages allocated for pmem. */
 104 static pmem_lpg_t *pmem_occ_lpgs;
 105 /* Memory pool to store residual small pages from large pages. */
 106 static page_t   *pmem_mpool = NULL;
 107 /* Number of small pages reside in pmem_mpool currently. */
 108 static pgcnt_t  pmem_nmpages = 0;
 109 /* To protect pmem_nmpages, pmem_mpool and pmem_occ_lpgs. */
 110 kmutex_t        pmem_mutex;
 111
 112 static int lpg_isfree(pmem_lpg_t *);
 113 static void pmem_lpg_sub(pmem_lpg_t **, pmem_lpg_t *);
 114 static void pmem_lpg_concat(pmem_lpg_t **, pmem_lpg_t **);
 115 static pmem_lpg_t *pmem_lpg_get(pmem_lpg_t *, page_t *, pmem_lpg_t **);
 116 static pmem_lpg_t *pmem_lpg_alloc(uint_t);
 117 static void pmem_lpg_free(pmem_lpg_t **, pmem_lpg_t *);
 118 static void lpg_free(page_t *spp);
 119 static pgcnt_t mpool_break(page_t **, pgcnt_t);
 120 static void mpool_append(page_t **, pgcnt_t);
 121 static void lpp_break(page_t **, pgcnt_t, pgcnt_t, pmem_lpg_t *);
 122 static void lpp_free(page_t *, pgcnt_t, pmem_lpg_t **);
 123 static int lpp_create(page_t **, pgcnt_t, pgcnt_t *, pmem_lpg_t **,
 124     vnode_t *, u_offset_t *, uint_t);
 125 static void tlist_in(page_t *, pgcnt_t, vnode_t *, u_offset_t *);
 126 static void tlist_out(page_t *, pgcnt_t);
 127 static int pmem_cookie_alloc(struct devmap_pmem_cookie **, pgcnt_t, uint_t);
 128 static int pmem_lock(pgcnt_t, proc_t *p);
 129
 130 /*
 131  * Called by driver devmap routine to pass physical memory mapping info to
 132  * seg_dev framework, used only for physical memory allocated from
 133  * devmap_pmem_alloc().
 134  */
 135 /* ARGSUSED */
 136 int
 137 devmap_pmem_setup(devmap_cookie_t dhc, dev_info_t *dip,
 138     struct devmap_callback_ctl *callbackops, devmap_pmem_cookie_t cookie,
 139     offset_t off, size_t len, uint_t maxprot, uint_t flags,
 140     ddi_device_acc_attr_t *accattrp)
 141 {
 142         devmap_handle_t *dhp = (devmap_handle_t *)dhc;
 143         struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
 144         uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
 145
 146         if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
 147                 return (DDI_FAILURE);
 148
 149         /*
 150          * First to check if this function has been called for this dhp.
 151          */
 152         if (dhp->dh_flags & DEVMAP_SETUP_DONE)
 153                 return (DDI_FAILURE);
 154
 155         if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
 156                 return (DDI_FAILURE);
 157
 158         /*
 159          * Check if the cache attributes are supported. Need to pay
 160          * attention that only uncachable or write-combining is
 161          * permitted for pmem.
 162          */
 163         if (i_ddi_check_cache_attr(flags) == B_FALSE ||
 164             (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
 165                 return (DDI_FAILURE);
 166
 167         if (flags & DEVMAP_MAPPING_INVALID) {
 168                 /*
 169                  * If DEVMAP_MAPPING_INVALID is specified, we have to grant
 170                  * remap permission.
 171                  */
 172                 if (!(flags & DEVMAP_ALLOW_REMAP))
 173                         return (DDI_FAILURE);
 174         } else {
 175                 dhp->dh_pcookie = (devmap_pmem_cookie_t)pcp;
 176                 /* dh_roff is the offset inside the dh_pcookie. */
 177                 dhp->dh_roff = ptob(btop(off));
 178                 /* Set the cache attributes correctly */
 179                 i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
 180         }
 181
 182         dhp->dh_cookie = DEVMAP_PMEM_COOKIE;
 183         dhp->dh_flags |= (flags & DEVMAP_SETUP_FLAGS);
 184         dhp->dh_len = ptob(btopr(len));
 185
 186         dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
 187         ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
 188
 189         if (callbackops != NULL) {
 190                 bcopy(callbackops, &dhp->dh_callbackops,
 191                     sizeof (struct devmap_callback_ctl));
 192         }
 193
 194         /*
 195          * Initialize dh_lock if we want to do remap.
 196          */
 197         if (dhp->dh_flags & DEVMAP_ALLOW_REMAP) {
 198                 mutex_init(&dhp->dh_lock, NULL, MUTEX_DEFAULT, NULL);
 199                 dhp->dh_flags |= DEVMAP_LOCK_INITED;
 200         }
 201
 202         dhp->dh_flags |= DEVMAP_SETUP_DONE;
 203
 204         return (DDI_SUCCESS);
 205 }
 206
 207 /*
 208  * Replace existing mapping using a new cookie, mainly gets called when doing
 209  * fork(). Should be called in associated devmap_dup(9E).
 210  */
 211 /* ARGSUSED */
 212 int
 213 devmap_pmem_remap(devmap_cookie_t dhc, dev_info_t *dip,
 214     devmap_pmem_cookie_t cookie, offset_t off, size_t len, uint_t maxprot,
 215     uint_t flags, ddi_device_acc_attr_t *accattrp)
 216 {
 217         devmap_handle_t *dhp = (devmap_handle_t *)dhc;
 218         struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
 219         uint_t cache_attr = IOMEM_CACHE_ATTR(flags);
 220
 221         /*
 222          * Reture failure if setup has not been done or no remap permission
 223          * has been granted during the setup.
 224          */
 225         if ((dhp->dh_flags & DEVMAP_SETUP_DONE) == 0 ||
 226             (dhp->dh_flags & DEVMAP_ALLOW_REMAP) == 0)
 227                 return (DDI_FAILURE);
 228
 229         /* No flags supported for remap yet. */
 230         if (flags != 0)
 231                 return (DDI_FAILURE);
 232
 233         if ((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) != dhp->dh_prot)
 234                 return (DDI_FAILURE);
 235
 236         if (pcp == NULL || (off + len) > ptob(pcp->dp_npages))
 237                 return (DDI_FAILURE);
 238
 239         /*
 240          * Check if the cache attributes are supported. Need to pay
 241          * attention that only uncachable or write-combining is
 242          * permitted for pmem.
 243          */
 244         if (i_ddi_check_cache_attr(flags) == B_FALSE ||
 245             (cache_attr & (IOMEM_DATA_UNCACHED|IOMEM_DATA_UC_WR_COMBINE)) == 0)
 246                 return (DDI_FAILURE);
 247
 248         HOLD_DHP_LOCK(dhp);
 249         /*
 250          * Unload the old mapping of pages reloated with this dhp, so next
 251          * fault will setup the new mappings. It is in segdev_faultpage that
 252          * calls hat_devload to establish the mapping. Do this while holding
 253          * the dhp lock so other faults dont reestablish the mappings.
 254          */
 255         hat_unload(dhp->dh_seg->s_as->a_hat, dhp->dh_uvaddr,
 256             dhp->dh_len, HAT_UNLOAD|HAT_UNLOAD_OTHER);
 257
 258         /* Set the cache attributes correctly */
 259         i_ddi_cacheattr_to_hatacc(cache_attr, &dhp->dh_hat_attr);
 260
 261         dhp->dh_pcookie = cookie;
 262         dhp->dh_roff = ptob(btop(off));
 263         dhp->dh_len = ptob(btopr(len));
 264
 265         /* Clear the large page size flag. */
 266         dhp->dh_flags &= ~DEVMAP_FLAG_LARGE;
 267
 268         dhp->dh_maxprot = maxprot & dhp->dh_orig_maxprot;
 269         ASSERT((dhp->dh_prot & dhp->dh_orig_maxprot & maxprot) == dhp->dh_prot);
 270         RELE_DHP_LOCK(dhp);
 271         return (DDI_SUCCESS);
 272 }
 273
 274 /*
 275  * Directly (i.e., without occupying kernel virtual address space) allocate
 276  * 'npages' physical memory pages for exporting to user land. The allocated
 277  * page_t pointer will be recorded in cookie.
 278  */
 279 int
 280 devmap_pmem_alloc(size_t size, uint_t flags, devmap_pmem_cookie_t *cookiep)
 281 {
 282         u_offset_t      pmem_off = 0;
 283         page_t          *pp = NULL;
 284         page_t          *lpp = NULL;
 285         page_t          *tlist = NULL;
 286         pgcnt_t         i = 0;
 287         pgcnt_t         rpages = 0;
 288         pgcnt_t         lpages = 0;
 289         pgcnt_t         tpages = 0;
 290         pgcnt_t         npages = btopr(size);
 291         pmem_lpg_t      *plp = NULL;
 292         struct devmap_pmem_cookie       *pcp;
 293         uint_t          reserved = 0;
 294         uint_t          locked = 0;
 295         uint_t          pflags, kflags;
 296
 297         *cookiep = NULL;
 298
 299         /*
 300          * Number larger than this will cause page_create_va() to loop
 301          * infinitely.
 302          */
 303         if (npages == 0 || npages >= total_pages / 2)
 304                 return (DDI_FAILURE);
 305         if ((flags & (PMEM_SLEEP | PMEM_NOSLEEP)) == 0)
 306                 return (DDI_FAILURE);
 307         pflags = flags & PMEM_NOSLEEP ? PG_EXCL : PG_WAIT;
 308         kflags = flags & PMEM_NOSLEEP ? KM_NOSLEEP : KM_SLEEP;
 309
 310         /* Allocate pmem cookie. */
 311         if (pmem_cookie_alloc(&pcp, npages, kflags) == DDI_FAILURE)
 312                 return (DDI_FAILURE);
 313         pcp->dp_npages = npages;
 314
 315         /*
 316          * See if the requested memory can be locked.
 317          */
 318         pcp->dp_proc = curproc;
 319         if (pmem_lock(npages, curproc) == DDI_FAILURE)
 320                 goto alloc_fail;
 321         locked = 1;
 322         /*
 323          * First, grab as many as possible from pmem_mpool. If pages in
 324          * pmem_mpool are enough for this request, we are done.
 325          */
 326         mutex_enter(&pmem_mutex);
 327         tpages = mpool_break(&tlist, npages);
 328         /* IOlock and hashin them into the new offset. */
 329         if (tpages)
 330                 tlist_in(tlist, tpages, pcp->dp_vnp, &pmem_off);
 331         mutex_exit(&pmem_mutex);
 332
 333         if (tpages == npages)
 334                 goto done;
 335
 336         rpages = npages - tpages;
 337         /* Quit now if memory cannot be reserved. */
 338         if (!page_resv(rpages, kflags))
 339                 goto alloc_fail;
 340         reserved = 1;
 341
 342         /* If we have large pages */
 343         if (pmem_lpgsize > PAGESIZE) {
 344                 /* Try to alloc large pages first to decrease fragmentation. */
 345                 i = (rpages + (pmem_pgcnt - 1)) / pmem_pgcnt;
 346                 if (lpp_create(&lpp, i, &lpages, &plp, pcp->dp_vnp, &pmem_off,
 347                     kflags) == DDI_FAILURE)
 348                         goto alloc_fail;
 349                 ASSERT(lpages == 0 ? lpp == NULL : 1);
 350         }
 351
 352         /*
 353          * Pages in large pages is more than the request, put the residual
 354          * pages into pmem_mpool.
 355          */
 356         if (lpages >= rpages) {
 357                 lpp_break(&lpp, lpages, lpages - rpages, plp);
 358                 goto done;
 359         }
 360
 361         /* Allocate small pages if lpp+tlist cannot satisfy the request. */
 362         i =  rpages - lpages;
 363         if ((pp = page_create_va(pcp->dp_vnp, pmem_off, ptob(i),
 364             pflags, &pmem_seg, (caddr_t)(uintptr_t)pmem_off)) == NULL)
 365                 goto alloc_fail;
 366
 367 done:
 368         page_list_concat(&tlist, &lpp);
 369         page_list_concat(&tlist, &pp);
 370         /* Set those small pages from large pages as allocated. */
 371         mutex_enter(&pmem_mutex);
 372         pmem_lpg_concat(&pmem_occ_lpgs, &plp);
 373         mutex_exit(&pmem_mutex);
 374
 375         /*
 376          * Now tlist holds all the pages for this cookie. Record these pages in
 377          * pmem cookie.
 378          */
 379         for (pp = tlist, i = 0; i < npages; i++) {
 380                 pcp->dp_pparray[i] = pp;
 381                 page_io_unlock(pp);
 382                 pp = pp->p_next;
 383                 page_sub(&tlist, pp->p_prev);
 384         }
 385         ASSERT(tlist == NULL);
 386         *cookiep = (devmap_pmem_cookie_t)pcp;
 387
 388         return (DDI_SUCCESS);
 389
 390 alloc_fail:
 391         DTRACE_PROBE(pmem__alloc__fail);
 392         /* Free large pages and the associated allocation records. */
 393         if (lpp)
 394                 lpp_free(lpp, lpages / pmem_pgcnt, &plp);
 395         if (reserved == 1)
 396                 page_unresv(rpages);
 397         /* Put those pages in tlist back into pmem_mpool. */
 398         if (tpages != 0) {
 399                 mutex_enter(&pmem_mutex);
 400                 /* IOunlock, hashout and update the allocation records. */
 401                 tlist_out(tlist, tpages);
 402                 mpool_append(&tlist, tpages);
 403                 mutex_exit(&pmem_mutex);
 404         }
 405         if (locked == 1)
 406                 i_ddi_decr_locked_memory(pcp->dp_proc, ptob(pcp->dp_npages));
 407         /* Freeing pmem_cookie. */
 408         kmem_free(pcp->dp_vnp, sizeof (vnode_t));
 409         kmem_free(pcp->dp_pparray, npages * sizeof (page_t *));
 410         kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
 411         return (DDI_FAILURE);
 412 }
 413
 414 /*
 415  * Free all small pages inside cookie, and return pages from large pages into
 416  * mpool, if all the pages from one large page is in mpool, free it as a whole.
 417  */
 418 void
 419 devmap_pmem_free(devmap_pmem_cookie_t cookie)
 420 {
 421         struct  devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
 422         pgcnt_t         i;
 423         pgcnt_t         tpages = 0;
 424         page_t          *pp;
 425         pmem_lpg_t      *pl1, *plp;
 426         pmem_lpg_t      *pf_lpgs = NULL;
 427         uint_t          npls = 0;
 428         pmem_lpg_t *last_pl = NULL;
 429         pmem_lpg_t *plast_pl = NULL;
 430
 431         ASSERT(pcp);
 432         mutex_enter(&pmem_mutex);
 433         /* Free small pages and return them to memory pool. */
 434         for (i = pcp->dp_npages; i > 0; i--) {
 435                 pp = pcp->dp_pparray[i - 1];
 436                 page_hashout(pp, NULL);
 437                 /*
 438                  * Remove the mapping of this single page, this mapping is
 439                  * created using hat_devload() in segdev_faultpage().
 440                  */
 441                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 442                 if (!FROM_LPG(pp)) {
 443                         /* Normal small page. */
 444                         page_free(pp, 1);
 445                         page_unresv(1);
 446                 } else {
 447                         /* Small page from large pages. */
 448                         plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
 449                         if (plp && !(plp->pl_pfree)) {
 450                                 /*
 451                                  * Move this record to pf_lpgs list, this large
 452                                  * page may be able to be freed as a whole.
 453                                  */
 454                                 pmem_lpg_sub(&pmem_occ_lpgs, plp);
 455                                 pmem_lpg_concat(&pf_lpgs, &plp);
 456                                 plp->pl_pfree = 1;
 457                                 npls++;
 458                                 last_pl = NULL;
 459                         } else {
 460                                 /* Search in pf_lpgs list. */
 461                                 plp = pmem_lpg_get(pf_lpgs, pp, &plast_pl);
 462                         }
 463                         ASSERT(plp);
 464                         /* Mark this page as free. */
 465                         BT_SET(plp->pl_bitmap, PFIND(pp));
 466                         /* Record this page in pmem_mpool. */
 467                         mpool_append(&pp, 1);
 468                 }
 469         }
 470
 471         /*
 472          * Find out the large pages whose pages have been freed, remove them
 473          * from plp list, free them and the associated pmem_lpg struct.
 474          */
 475         for (plp = pf_lpgs; npls != 0; npls--) {
 476                 pl1 = plp;
 477                 plp = plp->pl_next;
 478                 if (lpg_isfree(pl1)) {
 479                         /*
 480                          * Get one free large page.  Find all pages in this
 481                          * large page and remove them from pmem_mpool.
 482                          */
 483                         lpg_free(pl1->pl_pp);
 484                         /* Remove associated allocation records. */
 485                         pmem_lpg_sub(&pf_lpgs, pl1);
 486                         pmem_lpg_free(&pf_lpgs, pl1);
 487                         tpages -= pmem_pgcnt;
 488                 } else
 489                         pl1->pl_pfree = 0;
 490         }
 491         /* Update allocation records accordingly. */
 492         pmem_lpg_concat(&pmem_occ_lpgs, &pf_lpgs);
 493         mutex_exit(&pmem_mutex);
 494
 495         if (curproc == pcp->dp_proc)
 496                 i_ddi_decr_locked_memory(curproc, ptob(pcp->dp_npages));
 497         kmem_free(pcp->dp_vnp, sizeof (vnode_t));
 498         kmem_free(pcp->dp_pparray, pcp->dp_npages * sizeof (page_t *));
 499         kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
 500 }
 501
 502 /*
 503  * To extract page frame number from specified range in a cookie.
 504  */
 505 int
 506 devmap_pmem_getpfns(devmap_pmem_cookie_t cookie, uint_t start, pgcnt_t npages,
 507     pfn_t *pfnarray)
 508 {
 509         struct devmap_pmem_cookie *pcp = (struct devmap_pmem_cookie *)cookie;
 510         pgcnt_t i;
 511
 512         if (pcp == NULL || start + npages > pcp->dp_npages)
 513                 return (DDI_FAILURE);
 514
 515         for (i = start; i < start + npages; i++)
 516                 pfnarray[i - start] = pfn_to_mfn(pcp->dp_pparray[i]->p_pagenum);
 517
 518         return (DDI_SUCCESS);
 519 }
 520
 521 void
 522 pmem_init()
 523 {
 524         mutex_init(&pmem_mutex, NULL, MUTEX_DEFAULT, NULL);
 525         pmem_lszc = MIN(1, page_num_pagesizes() - 1);
 526         pmem_lpgsize = page_get_pagesize(pmem_lszc);
 527         pmem_pgcnt = pmem_lpgsize >> PAGESHIFT;
 528         bzero(&pmem_seg, sizeof (struct seg));
 529         pmem_seg.s_as = &kas;
 530 }
 531
 532 /* Allocate kernel memory for one pmem cookie with n pages. */
 533 static int
 534 pmem_cookie_alloc(struct devmap_pmem_cookie **pcpp, pgcnt_t n, uint_t kflags)
 535 {
 536         struct devmap_pmem_cookie *pcp;
 537
 538         if ((*pcpp = kmem_zalloc(sizeof (struct devmap_pmem_cookie),
 539             kflags)) == NULL)
 540                 return (DDI_FAILURE);
 541         pcp = *pcpp;
 542         if ((pcp->dp_vnp =
 543             kmem_zalloc(sizeof (vnode_t), kflags)) == NULL) {
 544                 kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
 545                 return (DDI_FAILURE);
 546         }
 547         if ((pcp->dp_pparray =
 548             kmem_zalloc(n * sizeof (page_t *), kflags)) == NULL) {
 549                 kmem_free(pcp->dp_vnp, sizeof (vnode_t));
 550                 kmem_free(pcp, sizeof (struct devmap_pmem_cookie));
 551                 return (DDI_FAILURE);
 552         }
 553         return (DDI_SUCCESS);
 554 }
 555
 556 /* Try to lock down n pages resource */
 557 static int
 558 pmem_lock(pgcnt_t n, proc_t *p)
 559 {
 560         if (i_ddi_incr_locked_memory(p, ptob(n)) != 0) {
 561                 return (DDI_FAILURE);
 562         }
 563         return (DDI_SUCCESS);
 564 }
 565
 566 /* To check if all the pages in a large page are freed. */
 567 static int
 568 lpg_isfree(pmem_lpg_t *plp)
 569 {
 570         uint_t i;
 571
 572         for (i = 0; i < BT_BITOUL(pmem_pgcnt); i++)
 573                 if (plp->pl_bitmap[i] != BT_ULMAXMASK)
 574                         return (0);
 575         /* All 1 means all pages are freed. */
 576         return (1);
 577 }
 578
 579 /*
 580  * Using pp to get the associated large page allocation record, searching in
 581  * the splp linked list with *last as the heuristic pointer. Return NULL if
 582  * not found.
 583  */
 584 static pmem_lpg_t *
 585 pmem_lpg_get(pmem_lpg_t *splp, page_t *pp, pmem_lpg_t **last)
 586 {
 587         pmem_lpg_t *plp;
 588         pgcnt_t root_pfn;
 589
 590         ASSERT(pp);
 591         if (splp == NULL)
 592                 return (NULL);
 593         root_pfn = page_pptonum(pp) & ~(pmem_pgcnt - 1);
 594
 595         /* Try last winner first. */
 596         if (*last && root_pfn == page_pptonum((*last)->pl_pp))
 597                 goto pl_found;
 598
 599         /* Else search the whole pmem_lpg list. */
 600         for (plp = splp; root_pfn != page_pptonum(plp->pl_pp); ) {
 601                 plp = plp->pl_next;
 602                 if (plp == splp) {
 603                         plp = NULL;
 604                         break;
 605                 }
 606                 ASSERT(plp->pl_pp);
 607         }
 608
 609         *last = plp;
 610
 611 pl_found:
 612         return (*last);
 613 }
 614
 615 /*
 616  *  Remove one pmem_lpg plp from the oplpp list.
 617  */
 618 static void
 619 pmem_lpg_sub(pmem_lpg_t **oplpp, pmem_lpg_t *plp)
 620 {
 621         if (*oplpp == plp)
 622                 *oplpp = plp->pl_next;          /* go to next pmem_lpg */
 623
 624         if (*oplpp == plp)
 625                 *oplpp = NULL;                  /* pmem_lpg list is gone */
 626         else {
 627                 plp->pl_prev->pl_next = plp->pl_next;
 628                 plp->pl_next->pl_prev = plp->pl_prev;
 629         }
 630         plp->pl_prev = plp->pl_next = plp;      /* make plp a list of one */
 631 }
 632
 633 /*
 634  * Concatenate page list nplpp onto the end of list plpp.
 635  */
 636 static void
 637 pmem_lpg_concat(pmem_lpg_t **plpp, pmem_lpg_t **nplpp)
 638 {
 639         pmem_lpg_t *s1p, *s2p, *e1p, *e2p;
 640
 641         if (*nplpp == NULL) {
 642                 return;
 643         }
 644         if (*plpp == NULL) {
 645                 *plpp = *nplpp;
 646                 return;
 647         }
 648         s1p = *plpp;
 649         e1p =  s1p->pl_prev;
 650         s2p = *nplpp;
 651         e2p = s2p->pl_prev;
 652         s1p->pl_prev = e2p;
 653         e2p->pl_next = s1p;
 654         e1p->pl_next = s2p;
 655         s2p->pl_prev = e1p;
 656 }
 657
 658 /*
 659  * Allocate and initialize the allocation record of one large page, the init
 660  * value is 'allocated'.
 661  */
 662 static pmem_lpg_t *
 663 pmem_lpg_alloc(uint_t kflags)
 664 {
 665         pmem_lpg_t *plp;
 666
 667         ASSERT(pmem_pgcnt % BT_NBIPUL == 0);
 668         plp = kmem_zalloc(sizeof (pmem_lpg_t), kflags);
 669         if (plp == NULL)
 670                 return (NULL);
 671         plp->pl_bitmap = kmem_zalloc(BT_SIZEOFMAP(pmem_pgcnt), kflags);
 672         if (plp->pl_bitmap == NULL) {
 673                 kmem_free(plp, sizeof (*plp));
 674                 return (NULL);
 675         }
 676         plp->pl_next = plp->pl_prev = plp;
 677         return (plp);
 678 }
 679
 680 /* Free one allocation record pointed by oplp. */
 681 static void
 682 pmem_lpg_free(pmem_lpg_t **headp, pmem_lpg_t *plp)
 683 {
 684         if (*headp == plp)
 685                 *headp = plp->pl_next;          /* go to next pmem_lpg_t */
 686
 687         if (*headp == plp)
 688                 *headp = NULL;                  /* this list is gone */
 689         else {
 690                 plp->pl_prev->pl_next = plp->pl_next;
 691                 plp->pl_next->pl_prev = plp->pl_prev;
 692         }
 693         kmem_free(plp->pl_bitmap, BT_SIZEOFMAP(pmem_pgcnt));
 694         kmem_free(plp, sizeof (*plp));
 695 }
 696
 697 /* Free one large page headed by spp from pmem_mpool. */
 698 static void
 699 lpg_free(page_t *spp)
 700 {
 701         page_t *pp1 = spp;
 702         uint_t i;
 703
 704         ASSERT(MUTEX_HELD(&pmem_mutex));
 705         for (i = 0; i < pmem_pgcnt; i++) {
 706                 /* Break pp1 from pmem_mpool. */
 707                 page_sub(&pmem_mpool, pp1);
 708                 pp1++;
 709         }
 710         /* Free pages in this large page. */
 711         page_free_pages(spp);
 712         page_unresv(pmem_pgcnt);
 713         pmem_nmpages -= pmem_pgcnt;
 714         ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
 715 }
 716
 717 /* Put n pages in *ppp list back into pmem_mpool. */
 718 static void
 719 mpool_append(page_t **ppp, pgcnt_t n)
 720 {
 721         ASSERT(MUTEX_HELD(&pmem_mutex));
 722         /* Put back pages. */
 723         page_list_concat(&pmem_mpool, ppp);
 724         pmem_nmpages += n;
 725         ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
 726 }
 727
 728 /*
 729  * Try to grab MIN(pmem_nmpages, n) pages from pmem_mpool, put them into *ppp
 730  * list, and return the number of grabbed pages.
 731  */
 732 static pgcnt_t
 733 mpool_break(page_t **ppp, pgcnt_t n)
 734 {
 735         pgcnt_t i;
 736
 737         ASSERT(MUTEX_HELD(&pmem_mutex));
 738         /* Grab the pages. */
 739         i = MIN(pmem_nmpages, n);
 740         *ppp = pmem_mpool;
 741         page_list_break(ppp, &pmem_mpool, i);
 742         pmem_nmpages -= i;
 743         ASSERT((pmem_nmpages && pmem_mpool) || (!pmem_nmpages && !pmem_mpool));
 744         return (i);
 745 }
 746
 747 /*
 748  * Create n large pages, lpages and plpp contains the number of small pages and
 749  * allocation records list respectively.
 750  */
 751 static int
 752 lpp_create(page_t **lppp, pgcnt_t n, pgcnt_t *lpages, pmem_lpg_t **plpp,
 753     vnode_t *vnp, u_offset_t *offp, uint_t kflags)
 754 {
 755         pgcnt_t i;
 756         pmem_lpg_t *plp;
 757         page_t *pp;
 758
 759         for (i = 0, *lpages = 0; i < n; i++) {
 760                 /* Allocte one large page each time. */
 761                 pp = page_create_va_large(vnp, *offp, pmem_lpgsize,
 762                     PG_EXCL, &pmem_seg, (caddr_t)(uintptr_t)*offp, NULL);
 763                 if (pp == NULL)
 764                         break;
 765                 *offp += pmem_lpgsize;
 766                 page_list_concat(lppp, &pp);
 767                 *lpages += pmem_pgcnt;
 768                 /* Add one allocation record for this large page. */
 769                 if ((plp = pmem_lpg_alloc(kflags)) == NULL)
 770                         return (DDI_FAILURE);
 771                 plp->pl_pp = pp;
 772                 pmem_lpg_concat(plpp, &plp);
 773         }
 774         return (DDI_SUCCESS);
 775 }
 776
 777 /*
 778  * Break the last r small pages from the large page list *lppp (with totally n
 779  * small pages) and put them into pmem_mpool.
 780  */
 781 static void
 782 lpp_break(page_t **lppp, pgcnt_t n, pgcnt_t r, pmem_lpg_t *oplp)
 783 {
 784         page_t *pp, *pp1;
 785         pgcnt_t i;
 786         pmem_lpg_t *plp;
 787
 788         if (r == 0)
 789                 return;
 790         ASSERT(*lppp != NULL && r < pmem_pgcnt);
 791         page_list_break(lppp, &pp, n - r);
 792
 793         /* The residual should reside in the last large page.  */
 794         plp = oplp->pl_prev;
 795         /* IOunlock and hashout the residual pages. */
 796         for (pp1 = pp, i = 0; i < r; i++) {
 797                 page_io_unlock(pp1);
 798                 page_hashout(pp1, NULL);
 799                 /* Mark this page as free. */
 800                 BT_SET(plp->pl_bitmap, PFIND(pp1));
 801                 pp1 = pp1->p_next;
 802         }
 803         ASSERT(pp1 == pp);
 804         /* Put these residual pages into memory pool. */
 805         mutex_enter(&pmem_mutex);
 806         mpool_append(&pp, r);
 807         mutex_exit(&pmem_mutex);
 808 }
 809
 810 /* Freeing large pages in lpp and the associated allocation records in plp. */
 811 static void
 812 lpp_free(page_t *lpp, pgcnt_t lpgs, pmem_lpg_t **plpp)
 813 {
 814         pgcnt_t i, j;
 815         page_t *pp = lpp, *pp1;
 816         pmem_lpg_t *plp1, *plp2;
 817
 818         for (i = 0; i < lpgs; i++) {
 819                 for (j = 0; j < pmem_pgcnt; j++) {
 820                         /* IO unlock and hashout this small page. */
 821                         page_io_unlock(pp);
 822                         page_hashout(pp, NULL);
 823                         pp1 = pp->p_next;
 824                         pp->p_prev = pp->p_next = pp;
 825                         pp = pp1;
 826                 }
 827                 /* Free one large page at one time. */
 828                 page_free_pages(lpp);
 829                 lpp = pp;
 830         }
 831         /* Free associate pmem large page allocation records. */
 832         for (plp1 = *plpp; *plpp; plp1 = plp2) {
 833                 plp2 = plp1->pl_next;
 834                 pmem_lpg_free(plpp, plp1);
 835         }
 836 }
 837
 838 /*
 839  * IOlock and hashin all pages in tlist, associate them with vnode *pvnp
 840  * and offset starting with *poffp. Update allocation records accordingly at
 841  * the same time.
 842  */
 843 static void
 844 tlist_in(page_t *tlist, pgcnt_t tpages, vnode_t *pvnp, u_offset_t *poffp)
 845 {
 846         page_t *pp;
 847         pgcnt_t i = 0;
 848         pmem_lpg_t *plp, *last_pl = NULL;
 849
 850         ASSERT(MUTEX_HELD(&pmem_mutex));
 851         for (pp = tlist; i < tpages; i++) {
 852                 ASSERT(FROM_LPG(pp));
 853                 page_io_lock(pp);
 854                 (void) page_hashin(pp, pvnp, *poffp, NULL);
 855                 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
 856                 /* Mark this page as allocated. */
 857                 BT_CLEAR(plp->pl_bitmap, PFIND(pp));
 858                 *poffp += PAGESIZE;
 859                 pp = pp->p_next;
 860         }
 861         ASSERT(pp == tlist);
 862 }
 863
 864 /*
 865  * IOunlock and hashout all pages in tlist, update allocation records
 866  * accordingly at the same time.
 867  */
 868 static void
 869 tlist_out(page_t *tlist, pgcnt_t tpages)
 870 {
 871         page_t *pp;
 872         pgcnt_t i = 0;
 873         pmem_lpg_t *plp, *last_pl = NULL;
 874
 875         ASSERT(MUTEX_HELD(&pmem_mutex));
 876         for (pp = tlist; i < tpages; i++) {
 877                 ASSERT(FROM_LPG(pp));
 878                 page_io_unlock(pp);
 879                 page_hashout(pp, NULL);
 880                 plp = pmem_lpg_get(pmem_occ_lpgs, pp, &last_pl);
 881                 /* Mark this page as free. */
 882                 BT_SET(plp->pl_bitmap, PFIND(pp));
 883                 pp = pp->p_next;
 884         }
 885         ASSERT(pp == tlist);
 886 }