mm/zbud.c

   1 /*
   2  * zbud.c
   3  *
   4  * Copyright (C) 2013, Seth Jennings, IBM
   5  *
   6  * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
   7  *
   8  * zbud is an special purpose allocator for storing compressed pages.  Contrary
   9  * to what its name may suggest, zbud is not a buddy allocator, but rather an
  10  * allocator that "buddies" two compressed pages together in a single memory
  11  * page.
  12  *
  13  * While this design limits storage density, it has simple and deterministic
  14  * reclaim properties that make it preferable to a higher density approach when
  15  * reclaim will be used.
  16  *
  17  * zbud works by storing compressed pages, or "zpages", together in pairs in a
  18  * single memory page called a "zbud page".  The first buddy is "left
  19  * justifed" at the beginning of the zbud page, and the last buddy is "right
  20  * justified" at the end of the zbud page.  The benefit is that if either
  21  * buddy is freed, the freed buddy space, coalesced with whatever slack space
  22  * that existed between the buddies, results in the largest possible free region
  23  * within the zbud page.
  24  *
  25  * zbud also provides an attractive lower bound on density. The ratio of zpages
  26  * to zbud pages can not be less than 1.  This ensures that zbud can never "do
  27  * harm" by using more pages to store zpages than the uncompressed zpages would
  28  * have used on their own.
  29  *
  30  * zbud pages are divided into "chunks".  The size of the chunks is fixed at
  31  * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
  32  * into chunks allows organizing unbuddied zbud pages into a manageable number
  33  * of unbuddied lists according to the number of free chunks available in the
  34  * zbud page.
  35  *
  36  * The zbud API differs from that of conventional allocators in that the
  37  * allocation function, zbud_alloc(), returns an opaque handle to the user,
  38  * not a dereferenceable pointer.  The user must map the handle using
  39  * zbud_map() in order to get a usable pointer by which to access the
  40  * allocation data and unmap the handle with zbud_unmap() when operations
  41  * on the allocation data are complete.
  42  */
  43
  44 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  45
  46 #include <linux/atomic.h>
  47 #include <linux/list.h>
  48 #include <linux/mm.h>
  49 #include <linux/module.h>
  50 #include <linux/preempt.h>
  51 #include <linux/slab.h>
  52 #include <linux/spinlock.h>
  53 #include <linux/zbud.h>
  54
  55 /*****************
  56  * Structures
  57 *****************/
  58 /*
  59  * NCHUNKS_ORDER determines the internal allocation granularity, effectively
  60  * adjusting internal fragmentation.  It also determines the number of
  61  * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
  62  * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
  63  * will be 64 freelists per pool.
  64  */
  65 #define NCHUNKS_ORDER   6
  66
  67 #define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
  68 #define CHUNK_SIZE      (1 << CHUNK_SHIFT)
  69 #define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT)
  70 #define ZHDR_SIZE_ALIGNED CHUNK_SIZE
  71
  72 /**
  73  * struct zbud_pool - stores metadata for each zbud pool
  74  * @lock:       protects all pool fields and first|last_chunk fields of any
  75  *              zbud page in the pool
  76  * @unbuddied:  array of lists tracking zbud pages that only contain one buddy;
  77  *              the lists each zbud page is added to depends on the size of
  78  *              its free region.
  79  * @buddied:    list tracking the zbud pages that contain two buddies;
  80  *              these zbud pages are full
  81  * @lru:        list tracking the zbud pages in LRU order by most recently
  82  *              added buddy.
  83  * @pages_nr:   number of zbud pages in the pool.
  84  * @ops:        pointer to a structure of user defined operations specified at
  85  *              pool creation time.
  86  *
  87  * This structure is allocated at pool creation time and maintains metadata
  88  * pertaining to a particular zbud pool.
  89  */
  90 struct zbud_pool {
  91         spinlock_t lock;
  92         struct list_head unbuddied[NCHUNKS];
  93         struct list_head buddied;
  94         struct list_head lru;
  95         u64 pages_nr;
  96         struct zbud_ops *ops;
  97 };
  98
  99 /*
 100  * struct zbud_header - zbud page metadata occupying the first chunk of each
 101  *                      zbud page.
 102  * @buddy:      links the zbud page into the unbuddied/buddied lists in the pool
 103  * @lru:        links the zbud page into the lru list in the pool
 104  * @first_chunks:       the size of the first buddy in chunks, 0 if free
 105  * @last_chunks:        the size of the last buddy in chunks, 0 if free
 106  */
 107 struct zbud_header {
 108         struct list_head buddy;
 109         struct list_head lru;
 110         unsigned int first_chunks;
 111         unsigned int last_chunks;
 112         bool under_reclaim;
 113 };
 114
 115 /*****************
 116  * Helpers
 117 *****************/
 118 /* Just to make the code easier to read */
 119 enum buddy {
 120         FIRST,
 121         LAST
 122 };
 123
 124 /* Converts an allocation size in bytes to size in zbud chunks */
 125 static int size_to_chunks(int size)
 126 {
 127         return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 128 }
 129
 130 #define for_each_unbuddied_list(_iter, _begin) \
 131         for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 132
 133 /* Initializes the zbud header of a newly allocated zbud page */
 134 static struct zbud_header *init_zbud_page(struct page *page)
 135 {
 136         struct zbud_header *zhdr = page_address(page);
 137         zhdr->first_chunks = 0;
 138         zhdr->last_chunks = 0;
 139         INIT_LIST_HEAD(&zhdr->buddy);
 140         INIT_LIST_HEAD(&zhdr->lru);
 141         zhdr->under_reclaim = 0;
 142         return zhdr;
 143 }
 144
 145 /* Resets the struct page fields and frees the page */
 146 static void free_zbud_page(struct zbud_header *zhdr)
 147 {
 148         __free_page(virt_to_page(zhdr));
 149 }
 150
 151 /*
 152  * Encodes the handle of a particular buddy within a zbud page
 153  * Pool lock should be held as this function accesses first|last_chunks
 154  */
 155 static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
 156 {
 157         unsigned long handle;
 158
 159         /*
 160          * For now, the encoded handle is actually just the pointer to the data
 161          * but this might not always be the case.  A little information hiding.
 162          * Add CHUNK_SIZE to the handle if it is the first allocation to jump
 163          * over the zbud header in the first chunk.
 164          */
 165         handle = (unsigned long)zhdr;
 166         if (bud == FIRST)
 167                 /* skip over zbud header */
 168                 handle += ZHDR_SIZE_ALIGNED;
 169         else /* bud == LAST */
 170                 handle += PAGE_SIZE - (zhdr->last_chunks  << CHUNK_SHIFT);
 171         return handle;
 172 }
 173
 174 /* Returns the zbud page where a given handle is stored */
 175 static struct zbud_header *handle_to_zbud_header(unsigned long handle)
 176 {
 177         return (struct zbud_header *)(handle & PAGE_MASK);
 178 }
 179
 180 /* Returns the number of free chunks in a zbud page */
 181 static int num_free_chunks(struct zbud_header *zhdr)
 182 {
 183         /*
 184          * Rather than branch for different situations, just use the fact that
 185          * free buddies have a length of zero to simplify everything. -1 at the
 186          * end for the zbud header.
 187          */
 188         return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
 189 }
 190
 191 /*****************
 192  * API Functions
 193 *****************/
 194 /**
 195  * zbud_create_pool() - create a new zbud pool
 196  * @gfp:        gfp flags when allocating the zbud pool structure
 197  * @ops:        user-defined operations for the zbud pool
 198  *
 199  * Return: pointer to the new zbud pool or NULL if the metadata allocation
 200  * failed.
 201  */
 202 struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
 203 {
 204         struct zbud_pool *pool;
 205         int i;
 206
 207         pool = kmalloc(sizeof(struct zbud_pool), gfp);
 208         if (!pool)
 209                 return NULL;
 210         spin_lock_init(&pool->lock);
 211         for_each_unbuddied_list(i, 0)
 212                 INIT_LIST_HEAD(&pool->unbuddied[i]);
 213         INIT_LIST_HEAD(&pool->buddied);
 214         INIT_LIST_HEAD(&pool->lru);
 215         pool->pages_nr = 0;
 216         pool->ops = ops;
 217         return pool;
 218 }
 219
 220 /**
 221  * zbud_destroy_pool() - destroys an existing zbud pool
 222  * @pool:       the zbud pool to be destroyed
 223  *
 224  * The pool should be emptied before this function is called.
 225  */
 226 void zbud_destroy_pool(struct zbud_pool *pool)
 227 {
 228         kfree(pool);
 229 }
 230
 231 /**
 232  * zbud_alloc() - allocates a region of a given size
 233  * @pool:       zbud pool from which to allocate
 234  * @size:       size in bytes of the desired allocation
 235  * @gfp:        gfp flags used if the pool needs to grow
 236  * @handle:     handle of the new allocation
 237  *
 238  * This function will attempt to find a free region in the pool large enough to
 239  * satisfy the allocation request.  A search of the unbuddied lists is
 240  * performed first. If no suitable free region is found, then a new page is
 241  * allocated and added to the pool to satisfy the request.
 242  *
 243  * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 244  * as zbud pool pages.
 245  *
 246  * Return: 0 if success and handle is set, otherwise -EINVAL is the size or
 247  * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 248  * a new page.
 249  */
 250 int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
 251                         unsigned long *handle)
 252 {
 253         int chunks, i, freechunks;
 254         struct zbud_header *zhdr = NULL;
 255         enum buddy bud;
 256         struct page *page;
 257
 258         if (size <= 0 || gfp & __GFP_HIGHMEM)
 259                 return -EINVAL;
 260         if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
 261                 return -ENOSPC;
 262         chunks = size_to_chunks(size);
 263         spin_lock(&pool->lock);
 264
 265         /* First, try to find an unbuddied zbud page. */
 266         zhdr = NULL;
 267         for_each_unbuddied_list(i, chunks) {
 268                 if (!list_empty(&pool->unbuddied[i])) {
 269                         zhdr = list_first_entry(&pool->unbuddied[i],
 270                                         struct zbud_header, buddy);
 271                         list_del(&zhdr->buddy);
 272                         if (zhdr->first_chunks == 0)
 273                                 bud = FIRST;
 274                         else
 275                                 bud = LAST;
 276                         goto found;
 277                 }
 278         }
 279
 280         /* Couldn't find unbuddied zbud page, create new one */
 281         spin_unlock(&pool->lock);
 282         page = alloc_page(gfp);
 283         if (!page)
 284                 return -ENOMEM;
 285         spin_lock(&pool->lock);
 286         pool->pages_nr++;
 287         zhdr = init_zbud_page(page);
 288         bud = FIRST;
 289
 290 found:
 291         if (bud == FIRST)
 292                 zhdr->first_chunks = chunks;
 293         else
 294                 zhdr->last_chunks = chunks;
 295
 296         if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
 297                 /* Add to unbuddied list */
 298                 freechunks = num_free_chunks(zhdr);
 299                 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
 300         } else {
 301                 /* Add to buddied list */
 302                 list_add(&zhdr->buddy, &pool->buddied);
 303         }
 304
 305         /* Add/move zbud page to beginning of LRU */
 306         if (!list_empty(&zhdr->lru))
 307                 list_del(&zhdr->lru);
 308         list_add(&zhdr->lru, &pool->lru);
 309
 310         *handle = encode_handle(zhdr, bud);
 311         spin_unlock(&pool->lock);
 312
 313         return 0;
 314 }
 315
 316 /**
 317  * zbud_free() - frees the allocation associated with the given handle
 318  * @pool:       pool in which the allocation resided
 319  * @handle:     handle associated with the allocation returned by zbud_alloc()
 320  *
 321  * In the case that the zbud page in which the allocation resides is under
 322  * reclaim, as indicated by the PG_reclaim flag being set, this function
 323  * only sets the first|last_chunks to 0.  The page is actually freed
 324  * once both buddies are evicted (see zbud_reclaim_page() below).
 325  */
 326 void zbud_free(struct zbud_pool *pool, unsigned long handle)
 327 {
 328         struct zbud_header *zhdr;
 329         int freechunks;
 330
 331         spin_lock(&pool->lock);
 332         zhdr = handle_to_zbud_header(handle);
 333
 334         /* If first buddy, handle will be page aligned */
 335         if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
 336                 zhdr->last_chunks = 0;
 337         else
 338                 zhdr->first_chunks = 0;
 339
 340         if (zhdr->under_reclaim) {
 341                 /* zbud page is under reclaim, reclaim will free */
 342                 spin_unlock(&pool->lock);
 343                 return;
 344         }
 345
 346         /* Remove from existing buddy list */
 347         list_del(&zhdr->buddy);
 348
 349         if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
 350                 /* zbud page is empty, free */
 351                 list_del(&zhdr->lru);
 352                 free_zbud_page(zhdr);
 353                 pool->pages_nr--;
 354         } else {
 355                 /* Add to unbuddied list */
 356                 freechunks = num_free_chunks(zhdr);
 357                 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
 358         }
 359
 360         spin_unlock(&pool->lock);
 361 }
 362
 363 #define list_tail_entry(ptr, type, member) \
 364         list_entry((ptr)->prev, type, member)
 365
 366 /**
 367  * zbud_reclaim_page() - evicts allocations from a pool page and frees it
 368  * @pool:       pool from which a page will attempt to be evicted
 369  * @retires:    number of pages on the LRU list for which eviction will
 370  *              be attempted before failing
 371  *
 372  * zbud reclaim is different from normal system reclaim in that the reclaim is
 373  * done from the bottom, up.  This is because only the bottom layer, zbud, has
 374  * information on how the allocations are organized within each zbud page. This
 375  * has the potential to create interesting locking situations between zbud and
 376  * the user, however.
 377  *
 378  * To avoid these, this is how zbud_reclaim_page() should be called:
 379
 380  * The user detects a page should be reclaimed and calls zbud_reclaim_page().
 381  * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
 382  * the user-defined eviction handler with the pool and handle as arguments.
 383  *
 384  * If the handle can not be evicted, the eviction handler should return
 385  * non-zero. zbud_reclaim_page() will add the zbud page back to the
 386  * appropriate list and try the next zbud page on the LRU up to
 387  * a user defined number of retries.
 388  *
 389  * If the handle is successfully evicted, the eviction handler should
 390  * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
 391  * contains logic to delay freeing the page if the page is under reclaim,
 392  * as indicated by the setting of the PG_reclaim flag on the underlying page.
 393  *
 394  * If all buddies in the zbud page are successfully evicted, then the
 395  * zbud page can be freed.
 396  *
 397  * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 398  * no pages to evict or an eviction handler is not registered, -EAGAIN if
 399  * the retry limit was hit.
 400  */
 401 int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
 402 {
 403         int i, ret, freechunks;
 404         struct zbud_header *zhdr;
 405         unsigned long first_handle = 0, last_handle = 0;
 406
 407         spin_lock(&pool->lock);
 408         if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
 409                         retries == 0) {
 410                 spin_unlock(&pool->lock);
 411                 return -EINVAL;
 412         }
 413         for (i = 0; i < retries; i++) {
 414                 zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
 415                 list_del(&zhdr->lru);
 416                 list_del(&zhdr->buddy);
 417                 /* Protect zbud page against free */
 418                 zhdr->under_reclaim = true;
 419                 /*
 420                  * We need encode the handles before unlocking, since we can
 421                  * race with free that will set (first|last)_chunks to 0
 422                  */
 423                 first_handle = 0;
 424                 last_handle = 0;
 425                 if (zhdr->first_chunks)
 426                         first_handle = encode_handle(zhdr, FIRST);
 427                 if (zhdr->last_chunks)
 428                         last_handle = encode_handle(zhdr, LAST);
 429                 spin_unlock(&pool->lock);
 430
 431                 /* Issue the eviction callback(s) */
 432                 if (first_handle) {
 433                         ret = pool->ops->evict(pool, first_handle);
 434                         if (ret)
 435                                 goto next;
 436                 }
 437                 if (last_handle) {
 438                         ret = pool->ops->evict(pool, last_handle);
 439                         if (ret)
 440                                 goto next;
 441                 }
 442 next:
 443                 spin_lock(&pool->lock);
 444                 zhdr->under_reclaim = false;
 445                 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
 446                         /*
 447                          * Both buddies are now free, free the zbud page and
 448                          * return success.
 449                          */
 450                         free_zbud_page(zhdr);
 451                         pool->pages_nr--;
 452                         spin_unlock(&pool->lock);
 453                         return 0;
 454                 } else if (zhdr->first_chunks == 0 ||
 455                                 zhdr->last_chunks == 0) {
 456                         /* add to unbuddied list */
 457                         freechunks = num_free_chunks(zhdr);
 458                         list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
 459                 } else {
 460                         /* add to buddied list */
 461                         list_add(&zhdr->buddy, &pool->buddied);
 462                 }
 463
 464                 /* add to beginning of LRU */
 465                 list_add(&zhdr->lru, &pool->lru);
 466         }
 467         spin_unlock(&pool->lock);
 468         return -EAGAIN;
 469 }
 470
 471 /**
 472  * zbud_map() - maps the allocation associated with the given handle
 473  * @pool:       pool in which the allocation resides
 474  * @handle:     handle associated with the allocation to be mapped
 475  *
 476  * While trivial for zbud, the mapping functions for others allocators
 477  * implementing this allocation API could have more complex information encoded
 478  * in the handle and could create temporary mappings to make the data
 479  * accessible to the user.
 480  *
 481  * Returns: a pointer to the mapped allocation
 482  */
 483 void *zbud_map(struct zbud_pool *pool, unsigned long handle)
 484 {
 485         return (void *)(handle);
 486 }
 487
 488 /**
 489  * zbud_unmap() - maps the allocation associated with the given handle
 490  * @pool:       pool in which the allocation resides
 491  * @handle:     handle associated with the allocation to be unmapped
 492  */
 493 void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
 494 {
 495 }
 496
 497 /**
 498  * zbud_get_pool_size() - gets the zbud pool size in pages
 499  * @pool:       pool whose size is being queried
 500  *
 501  * Returns: size in pages of the given pool.  The pool lock need not be
 502  * taken to access pages_nr.
 503  */
 504 u64 zbud_get_pool_size(struct zbud_pool *pool)
 505 {
 506         return pool->pages_nr;
 507 }
 508
 509 static int __init init_zbud(void)
 510 {
 511         /* Make sure the zbud header will fit in one chunk */
 512         BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
 513         pr_info("loaded\n");
 514         return 0;
 515 }
 516
 517 static void __exit exit_zbud(void)
 518 {
 519         pr_info("unloaded\n");
 520 }
 521
 522 module_init(init_zbud);
 523 module_exit(exit_zbud);
 524
 525 MODULE_LICENSE("GPL");
 526 MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
 527 MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");