mm/percpu.c

   1 /*
   2  * linux/mm/percpu.c - percpu memory allocator
   3  *
   4  * Copyright (C) 2009           SUSE Linux Products GmbH
   5  * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
   6  *
   7  * This file is released under the GPLv2.
   8  *
   9  * This is percpu allocator which can handle both static and dynamic
  10  * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
  11  * chunk consists of num_possible_cpus() units and the first chunk
  12  * is used for static percpu variables in the kernel image (special
  13  * boot time alloc/init handling necessary as these areas need to be
  14  * brought up before allocation services are running).  Unit grows as
  15  * necessary and all units grow or shrink in unison.  When a chunk is
  16  * filled up, another chunk is allocated.  ie. in vmalloc area
  17  *
  18  *  c0                           c1                         c2
  19  *  -------------------          -------------------        ------------
  20  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
  21  *  -------------------  ......  -------------------  ....  ------------
  22  *
  23  * Allocation is done in offset-size areas of single unit space.  Ie,
  24  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  25  * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
  26  * percpu base registers UNIT_SIZE apart.
  27  *
  28  * There are usually many small percpu allocations, many of them as
  29  * small as 4 bytes.  The allocator organizes chunks into lists
  30  * according to free size and tries to allocate from the fullest one.
  31  * Each chunk keeps the maximum contiguous area size hint which is
  32  * guaranteed to be equal to or larger than the maximum contiguous
  33  * area in the chunk.  This helps the allocator not to iterate the
  34  * chunk maps unnecessarily.
  35  *
  36  * Allocation state in each chunk is kept using an array of integers
  37  * on chunk->map.  A positive value in the map represents a free
  38  * region and negative allocated.  Allocation inside a chunk is done
  39  * by scanning this map sequentially and serving the first matching
  40  * entry.  This is mostly copied from the percpu_modalloc() allocator.
  41  * Chunks are also linked into a rb tree to ease address to chunk
  42  * mapping during free.
  43  *
  44  * To use this allocator, arch code should do the following:
  45  *
  46  * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
  47  *
  48  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  49  *   regular address to percpu pointer and back
  50  *
  51  * - use pcpu_setup_first_chunk() during percpu area initialization to
  52  *   setup the first chunk containing the kernel static percpu area
  53  */
  54
  55 #include <linux/bitmap.h>
  56 #include <linux/bootmem.h>
  57 #include <linux/list.h>
  58 #include <linux/mm.h>
  59 #include <linux/module.h>
  60 #include <linux/mutex.h>
  61 #include <linux/percpu.h>
  62 #include <linux/pfn.h>
  63 #include <linux/rbtree.h>
  64 #include <linux/slab.h>
  65 #include <linux/vmalloc.h>
  66
  67 #include <asm/cacheflush.h>
  68 #include <asm/tlbflush.h>
  69
  70 #define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 shares the same slot */
  71 #define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */
  72
  73 struct pcpu_chunk {
  74         struct list_head        list;           /* linked to pcpu_slot lists */
  75         struct rb_node          rb_node;        /* key is chunk->vm->addr */
  76         int                     free_size;      /* free bytes in the chunk */
  77         int                     contig_hint;    /* max contiguous size hint */
  78         struct vm_struct        *vm;            /* mapped vmalloc region */
  79         int                     map_used;       /* # of map entries used */
  80         int                     map_alloc;      /* # of map entries allocated */
  81         int                     *map;           /* allocation map */
  82         bool                    immutable;      /* no [de]population allowed */
  83         struct page             *page[];        /* #cpus * UNIT_PAGES */
  84 };
  85
  86 static int pcpu_unit_pages __read_mostly;
  87 static int pcpu_unit_size __read_mostly;
  88 static int pcpu_chunk_size __read_mostly;
  89 static int pcpu_nr_slots __read_mostly;
  90 static size_t pcpu_chunk_struct_size __read_mostly;
  91
  92 /* the address of the first chunk which starts with the kernel static area */
  93 void *pcpu_base_addr __read_mostly;
  94 EXPORT_SYMBOL_GPL(pcpu_base_addr);
  95
  96 /*
  97  * One mutex to rule them all.
  98  *
  99  * The following mutex is grabbed in the outermost public alloc/free
 100  * interface functions and released only when the operation is
 101  * complete.  As such, every function in this file other than the
 102  * outermost functions are called under pcpu_mutex.
 103  *
 104  * It can easily be switched to use spinlock such that only the area
 105  * allocation and page population commit are protected with it doing
 106  * actual [de]allocation without holding any lock.  However, given
 107  * what this allocator does, I think it's better to let them run
 108  * sequentially.
 109  */
 110 static DEFINE_MUTEX(pcpu_mutex);
 111
 112 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 113 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
 114
 115 static int __pcpu_size_to_slot(int size)
 116 {
 117         int highbit = fls(size);        /* size is in bytes */
 118         return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
 119 }
 120
 121 static int pcpu_size_to_slot(int size)
 122 {
 123         if (size == pcpu_unit_size)
 124                 return pcpu_nr_slots - 1;
 125         return __pcpu_size_to_slot(size);
 126 }
 127
 128 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 129 {
 130         if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
 131                 return 0;
 132
 133         return pcpu_size_to_slot(chunk->free_size);
 134 }
 135
 136 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 137 {
 138         return cpu * pcpu_unit_pages + page_idx;
 139 }
 140
 141 static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 142                                       unsigned int cpu, int page_idx)
 143 {
 144         return &chunk->page[pcpu_page_idx(cpu, page_idx)];
 145 }
 146
 147 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 148                                      unsigned int cpu, int page_idx)
 149 {
 150         return (unsigned long)chunk->vm->addr +
 151                 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
 152 }
 153
 154 static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 155                                      int page_idx)
 156 {
 157         return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 158 }
 159
 160 /**
 161  * pcpu_realloc - versatile realloc
 162  * @p: the current pointer (can be NULL for new allocations)
 163  * @size: the current size in bytes (can be 0 for new allocations)
 164  * @new_size: the wanted new size in bytes (can be 0 for free)
 165  *
 166  * More robust realloc which can be used to allocate, resize or free a
 167  * memory area of arbitrary size.  If the needed size goes over
 168  * PAGE_SIZE, kernel VM is used.
 169  *
 170  * RETURNS:
 171  * The new pointer on success, NULL on failure.
 172  */
 173 static void *pcpu_realloc(void *p, size_t size, size_t new_size)
 174 {
 175         void *new;
 176
 177         if (new_size <= PAGE_SIZE)
 178                 new = kmalloc(new_size, GFP_KERNEL);
 179         else
 180                 new = vmalloc(new_size);
 181         if (new_size && !new)
 182                 return NULL;
 183
 184         memcpy(new, p, min(size, new_size));
 185         if (new_size > size)
 186                 memset(new + size, 0, new_size - size);
 187
 188         if (size <= PAGE_SIZE)
 189                 kfree(p);
 190         else
 191                 vfree(p);
 192
 193         return new;
 194 }
 195
 196 /**
 197  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 198  * @chunk: chunk of interest
 199  * @oslot: the previous slot it was on
 200  *
 201  * This function is called after an allocation or free changed @chunk.
 202  * New slot according to the changed state is determined and @chunk is
 203  * moved to the slot.
 204  */
 205 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 206 {
 207         int nslot = pcpu_chunk_slot(chunk);
 208
 209         if (oslot != nslot) {
 210                 if (oslot < nslot)
 211                         list_move(&chunk->list, &pcpu_slot[nslot]);
 212                 else
 213                         list_move_tail(&chunk->list, &pcpu_slot[nslot]);
 214         }
 215 }
 216
 217 static struct rb_node **pcpu_chunk_rb_search(void *addr,
 218                                              struct rb_node **parentp)
 219 {
 220         struct rb_node **p = &pcpu_addr_root.rb_node;
 221         struct rb_node *parent = NULL;
 222         struct pcpu_chunk *chunk;
 223
 224         while (*p) {
 225                 parent = *p;
 226                 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
 227
 228                 if (addr < chunk->vm->addr)
 229                         p = &(*p)->rb_left;
 230                 else if (addr > chunk->vm->addr)
 231                         p = &(*p)->rb_right;
 232                 else
 233                         break;
 234         }
 235
 236         if (parentp)
 237                 *parentp = parent;
 238         return p;
 239 }
 240
 241 /**
 242  * pcpu_chunk_addr_search - search for chunk containing specified address
 243  * @addr: address to search for
 244  *
 245  * Look for chunk which might contain @addr.  More specifically, it
 246  * searchs for the chunk with the highest start address which isn't
 247  * beyond @addr.
 248  *
 249  * RETURNS:
 250  * The address of the found chunk.
 251  */
 252 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 253 {
 254         struct rb_node *n, *parent;
 255         struct pcpu_chunk *chunk;
 256
 257         n = *pcpu_chunk_rb_search(addr, &parent);
 258         if (!n) {
 259                 /* no exactly matching chunk, the parent is the closest */
 260                 n = parent;
 261                 BUG_ON(!n);
 262         }
 263         chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 264
 265         if (addr < chunk->vm->addr) {
 266                 /* the parent was the next one, look for the previous one */
 267                 n = rb_prev(n);
 268                 BUG_ON(!n);
 269                 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 270         }
 271
 272         return chunk;
 273 }
 274
 275 /**
 276  * pcpu_chunk_addr_insert - insert chunk into address rb tree
 277  * @new: chunk to insert
 278  *
 279  * Insert @new into address rb tree.
 280  */
 281 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 282 {
 283         struct rb_node **p, *parent;
 284
 285         p = pcpu_chunk_rb_search(new->vm->addr, &parent);
 286         BUG_ON(*p);
 287         rb_link_node(&new->rb_node, parent, p);
 288         rb_insert_color(&new->rb_node, &pcpu_addr_root);
 289 }
 290
 291 /**
 292  * pcpu_split_block - split a map block
 293  * @chunk: chunk of interest
 294  * @i: index of map block to split
 295  * @head: head size in bytes (can be 0)
 296  * @tail: tail size in bytes (can be 0)
 297  *
 298  * Split the @i'th map block into two or three blocks.  If @head is
 299  * non-zero, @head bytes block is inserted before block @i moving it
 300  * to @i+1 and reducing its size by @head bytes.
 301  *
 302  * If @tail is non-zero, the target block, which can be @i or @i+1
 303  * depending on @head, is reduced by @tail bytes and @tail byte block
 304  * is inserted after the target block.
 305  *
 306  * RETURNS:
 307  * 0 on success, -errno on failure.
 308  */
 309 static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 310 {
 311         int nr_extra = !!head + !!tail;
 312         int target = chunk->map_used + nr_extra;
 313
 314         /* reallocation required? */
 315         if (chunk->map_alloc < target) {
 316                 int new_alloc;
 317                 int *new;
 318
 319                 new_alloc = PCPU_DFL_MAP_ALLOC;
 320                 while (new_alloc < target)
 321                         new_alloc *= 2;
 322
 323                 if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
 324                         /*
 325                          * map_alloc smaller than the default size
 326                          * indicates that the chunk is one of the
 327                          * first chunks and still using static map.
 328                          * Allocate a dynamic one and copy.
 329                          */
 330                         new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
 331                         if (new)
 332                                 memcpy(new, chunk->map,
 333                                        chunk->map_alloc * sizeof(new[0]));
 334                 } else
 335                         new = pcpu_realloc(chunk->map,
 336                                            chunk->map_alloc * sizeof(new[0]),
 337                                            new_alloc * sizeof(new[0]));
 338                 if (!new)
 339                         return -ENOMEM;
 340
 341                 chunk->map_alloc = new_alloc;
 342                 chunk->map = new;
 343         }
 344
 345         /* insert a new subblock */
 346         memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 347                 sizeof(chunk->map[0]) * (chunk->map_used - i));
 348         chunk->map_used += nr_extra;
 349
 350         if (head) {
 351                 chunk->map[i + 1] = chunk->map[i] - head;
 352                 chunk->map[i++] = head;
 353         }
 354         if (tail) {
 355                 chunk->map[i++] -= tail;
 356                 chunk->map[i] = tail;
 357         }
 358         return 0;
 359 }
 360
 361 /**
 362  * pcpu_alloc_area - allocate area from a pcpu_chunk
 363  * @chunk: chunk of interest
 364  * @size: wanted size in bytes
 365  * @align: wanted align
 366  *
 367  * Try to allocate @size bytes area aligned at @align from @chunk.
 368  * Note that this function only allocates the offset.  It doesn't
 369  * populate or map the area.
 370  *
 371  * RETURNS:
 372  * Allocated offset in @chunk on success, -errno on failure.
 373  */
 374 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 375 {
 376         int oslot = pcpu_chunk_slot(chunk);
 377         int max_contig = 0;
 378         int i, off;
 379
 380         for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 381                 bool is_last = i + 1 == chunk->map_used;
 382                 int head, tail;
 383
 384                 /* extra for alignment requirement */
 385                 head = ALIGN(off, align) - off;
 386                 BUG_ON(i == 0 && head != 0);
 387
 388                 if (chunk->map[i] < 0)
 389                         continue;
 390                 if (chunk->map[i] < head + size) {
 391                         max_contig = max(chunk->map[i], max_contig);
 392                         continue;
 393                 }
 394
 395                 /*
 396                  * If head is small or the previous block is free,
 397                  * merge'em.  Note that 'small' is defined as smaller
 398                  * than sizeof(int), which is very small but isn't too
 399                  * uncommon for percpu allocations.
 400                  */
 401                 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
 402                         if (chunk->map[i - 1] > 0)
 403                                 chunk->map[i - 1] += head;
 404                         else {
 405                                 chunk->map[i - 1] -= head;
 406                                 chunk->free_size -= head;
 407                         }
 408                         chunk->map[i] -= head;
 409                         off += head;
 410                         head = 0;
 411                 }
 412
 413                 /* if tail is small, just keep it around */
 414                 tail = chunk->map[i] - head - size;
 415                 if (tail < sizeof(int))
 416                         tail = 0;
 417
 418                 /* split if warranted */
 419                 if (head || tail) {
 420                         if (pcpu_split_block(chunk, i, head, tail))
 421                                 return -ENOMEM;
 422                         if (head) {
 423                                 i++;
 424                                 off += head;
 425                                 max_contig = max(chunk->map[i - 1], max_contig);
 426                         }
 427                         if (tail)
 428                                 max_contig = max(chunk->map[i + 1], max_contig);
 429                 }
 430
 431                 /* update hint and mark allocated */
 432                 if (is_last)
 433                         chunk->contig_hint = max_contig; /* fully scanned */
 434                 else
 435                         chunk->contig_hint = max(chunk->contig_hint,
 436                                                  max_contig);
 437
 438                 chunk->free_size -= chunk->map[i];
 439                 chunk->map[i] = -chunk->map[i];
 440
 441                 pcpu_chunk_relocate(chunk, oslot);
 442                 return off;
 443         }
 444
 445         chunk->contig_hint = max_contig;        /* fully scanned */
 446         pcpu_chunk_relocate(chunk, oslot);
 447
 448         /*
 449          * Tell the upper layer that this chunk has no area left.
 450          * Note that this is not an error condition but a notification
 451          * to upper layer that it needs to look at other chunks.
 452          * -ENOSPC is chosen as it isn't used in memory subsystem and
 453          * matches the meaning in a way.
 454          */
 455         return -ENOSPC;
 456 }
 457
 458 /**
 459  * pcpu_free_area - free area to a pcpu_chunk
 460  * @chunk: chunk of interest
 461  * @freeme: offset of area to free
 462  *
 463  * Free area starting from @freeme to @chunk.  Note that this function
 464  * only modifies the allocation map.  It doesn't depopulate or unmap
 465  * the area.
 466  */
 467 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 468 {
 469         int oslot = pcpu_chunk_slot(chunk);
 470         int i, off;
 471
 472         for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
 473                 if (off == freeme)
 474                         break;
 475         BUG_ON(off != freeme);
 476         BUG_ON(chunk->map[i] > 0);
 477
 478         chunk->map[i] = -chunk->map[i];
 479         chunk->free_size += chunk->map[i];
 480
 481         /* merge with previous? */
 482         if (i > 0 && chunk->map[i - 1] >= 0) {
 483                 chunk->map[i - 1] += chunk->map[i];
 484                 chunk->map_used--;
 485                 memmove(&chunk->map[i], &chunk->map[i + 1],
 486                         (chunk->map_used - i) * sizeof(chunk->map[0]));
 487                 i--;
 488         }
 489         /* merge with next? */
 490         if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
 491                 chunk->map[i] += chunk->map[i + 1];
 492                 chunk->map_used--;
 493                 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
 494                         (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
 495         }
 496
 497         chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
 498         pcpu_chunk_relocate(chunk, oslot);
 499 }
 500
 501 /**
 502  * pcpu_unmap - unmap pages out of a pcpu_chunk
 503  * @chunk: chunk of interest
 504  * @page_start: page index of the first page to unmap
 505  * @page_end: page index of the last page to unmap + 1
 506  * @flush: whether to flush cache and tlb or not
 507  *
 508  * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 509  * If @flush is true, vcache is flushed before unmapping and tlb
 510  * after.
 511  */
 512 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 513                        bool flush)
 514 {
 515         unsigned int last = num_possible_cpus() - 1;
 516         unsigned int cpu;
 517
 518         /* unmap must not be done on immutable chunk */
 519         WARN_ON(chunk->immutable);
 520
 521         /*
 522          * Each flushing trial can be very expensive, issue flush on
 523          * the whole region at once rather than doing it for each cpu.
 524          * This could be an overkill but is more scalable.
 525          */
 526         if (flush)
 527                 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 528                                    pcpu_chunk_addr(chunk, last, page_end));
 529
 530         for_each_possible_cpu(cpu)
 531                 unmap_kernel_range_noflush(
 532                                 pcpu_chunk_addr(chunk, cpu, page_start),
 533                                 (page_end - page_start) << PAGE_SHIFT);
 534
 535         /* ditto as flush_cache_vunmap() */
 536         if (flush)
 537                 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 538                                        pcpu_chunk_addr(chunk, last, page_end));
 539 }
 540
 541 /**
 542  * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 543  * @chunk: chunk to depopulate
 544  * @off: offset to the area to depopulate
 545  * @size: size of the area to depopulate in bytes
 546  * @flush: whether to flush cache and tlb or not
 547  *
 548  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 549  * from @chunk.  If @flush is true, vcache is flushed before unmapping
 550  * and tlb after.
 551  */
 552 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 553                                   bool flush)
 554 {
 555         int page_start = PFN_DOWN(off);
 556         int page_end = PFN_UP(off + size);
 557         int unmap_start = -1;
 558         int uninitialized_var(unmap_end);
 559         unsigned int cpu;
 560         int i;
 561
 562         for (i = page_start; i < page_end; i++) {
 563                 for_each_possible_cpu(cpu) {
 564                         struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 565
 566                         if (!*pagep)
 567                                 continue;
 568
 569                         __free_page(*pagep);
 570
 571                         /*
 572                          * If it's partial depopulation, it might get
 573                          * populated or depopulated again.  Mark the
 574                          * page gone.
 575                          */
 576                         *pagep = NULL;
 577
 578                         unmap_start = unmap_start < 0 ? i : unmap_start;
 579                         unmap_end = i + 1;
 580                 }
 581         }
 582
 583         if (unmap_start >= 0)
 584                 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 585 }
 586
 587 /**
 588  * pcpu_map - map pages into a pcpu_chunk
 589  * @chunk: chunk of interest
 590  * @page_start: page index of the first page to map
 591  * @page_end: page index of the last page to map + 1
 592  *
 593  * For each cpu, map pages [@page_start,@page_end) into @chunk.
 594  * vcache is flushed afterwards.
 595  */
 596 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 597 {
 598         unsigned int last = num_possible_cpus() - 1;
 599         unsigned int cpu;
 600         int err;
 601
 602         /* map must not be done on immutable chunk */
 603         WARN_ON(chunk->immutable);
 604
 605         for_each_possible_cpu(cpu) {
 606                 err = map_kernel_range_noflush(
 607                                 pcpu_chunk_addr(chunk, cpu, page_start),
 608                                 (page_end - page_start) << PAGE_SHIFT,
 609                                 PAGE_KERNEL,
 610                                 pcpu_chunk_pagep(chunk, cpu, page_start));
 611                 if (err < 0)
 612                         return err;
 613         }
 614
 615         /* flush at once, please read comments in pcpu_unmap() */
 616         flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 617                          pcpu_chunk_addr(chunk, last, page_end));
 618         return 0;
 619 }
 620
 621 /**
 622  * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 623  * @chunk: chunk of interest
 624  * @off: offset to the area to populate
 625  * @size: size of the area to populate in bytes
 626  *
 627  * For each cpu, populate and map pages [@page_start,@page_end) into
 628  * @chunk.  The area is cleared on return.
 629  */
 630 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 631 {
 632         const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 633         int page_start = PFN_DOWN(off);
 634         int page_end = PFN_UP(off + size);
 635         int map_start = -1;
 636         int uninitialized_var(map_end);
 637         unsigned int cpu;
 638         int i;
 639
 640         for (i = page_start; i < page_end; i++) {
 641                 if (pcpu_chunk_page_occupied(chunk, i)) {
 642                         if (map_start >= 0) {
 643                                 if (pcpu_map(chunk, map_start, map_end))
 644                                         goto err;
 645                                 map_start = -1;
 646                         }
 647                         continue;
 648                 }
 649
 650                 map_start = map_start < 0 ? i : map_start;
 651                 map_end = i + 1;
 652
 653                 for_each_possible_cpu(cpu) {
 654                         struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 655
 656                         *pagep = alloc_pages_node(cpu_to_node(cpu),
 657                                                   alloc_mask, 0);
 658                         if (!*pagep)
 659                                 goto err;
 660                 }
 661         }
 662
 663         if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 664                 goto err;
 665
 666         for_each_possible_cpu(cpu)
 667                 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 668                        size);
 669
 670         return 0;
 671 err:
 672         /* likely under heavy memory pressure, give memory back */
 673         pcpu_depopulate_chunk(chunk, off, size, true);
 674         return -ENOMEM;
 675 }
 676
 677 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 678 {
 679         if (!chunk)
 680                 return;
 681         if (chunk->vm)
 682                 free_vm_area(chunk->vm);
 683         pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
 684         kfree(chunk);
 685 }
 686
 687 static struct pcpu_chunk *alloc_pcpu_chunk(void)
 688 {
 689         struct pcpu_chunk *chunk;
 690
 691         chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
 692         if (!chunk)
 693                 return NULL;
 694
 695         chunk->map = pcpu_realloc(NULL, 0,
 696                                   PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 697         chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 698         chunk->map[chunk->map_used++] = pcpu_unit_size;
 699
 700         chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 701         if (!chunk->vm) {
 702                 free_pcpu_chunk(chunk);
 703                 return NULL;
 704         }
 705
 706         INIT_LIST_HEAD(&chunk->list);
 707         chunk->free_size = pcpu_unit_size;
 708         chunk->contig_hint = pcpu_unit_size;
 709
 710         return chunk;
 711 }
 712
 713 /**
 714  * __alloc_percpu - allocate percpu area
 715  * @size: size of area to allocate in bytes
 716  * @align: alignment of area (max PAGE_SIZE)
 717  *
 718  * Allocate percpu area of @size bytes aligned at @align.  Might
 719  * sleep.  Might trigger writeouts.
 720  *
 721  * RETURNS:
 722  * Percpu pointer to the allocated area on success, NULL on failure.
 723  */
 724 void *__alloc_percpu(size_t size, size_t align)
 725 {
 726         void *ptr = NULL;
 727         struct pcpu_chunk *chunk;
 728         int slot, off;
 729
 730         if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 731                 WARN(true, "illegal size (%zu) or align (%zu) for "
 732                      "percpu allocation\n", size, align);
 733                 return NULL;
 734         }
 735
 736         mutex_lock(&pcpu_mutex);
 737
 738         /* allocate area */
 739         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 740                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 741                         if (size > chunk->contig_hint)
 742                                 continue;
 743                         off = pcpu_alloc_area(chunk, size, align);
 744                         if (off >= 0)
 745                                 goto area_found;
 746                         if (off != -ENOSPC)
 747                                 goto out_unlock;
 748                 }
 749         }
 750
 751         /* hmmm... no space left, create a new chunk */
 752         chunk = alloc_pcpu_chunk();
 753         if (!chunk)
 754                 goto out_unlock;
 755         pcpu_chunk_relocate(chunk, -1);
 756         pcpu_chunk_addr_insert(chunk);
 757
 758         off = pcpu_alloc_area(chunk, size, align);
 759         if (off < 0)
 760                 goto out_unlock;
 761
 762 area_found:
 763         /* populate, map and clear the area */
 764         if (pcpu_populate_chunk(chunk, off, size)) {
 765                 pcpu_free_area(chunk, off);
 766                 goto out_unlock;
 767         }
 768
 769         ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
 770 out_unlock:
 771         mutex_unlock(&pcpu_mutex);
 772         return ptr;
 773 }
 774 EXPORT_SYMBOL_GPL(__alloc_percpu);
 775
 776 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 777 {
 778         WARN_ON(chunk->immutable);
 779         pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 780         list_del(&chunk->list);
 781         rb_erase(&chunk->rb_node, &pcpu_addr_root);
 782         free_pcpu_chunk(chunk);
 783 }
 784
 785 /**
 786  * free_percpu - free percpu area
 787  * @ptr: pointer to area to free
 788  *
 789  * Free percpu area @ptr.  Might sleep.
 790  */
 791 void free_percpu(void *ptr)
 792 {
 793         void *addr = __pcpu_ptr_to_addr(ptr);
 794         struct pcpu_chunk *chunk;
 795         int off;
 796
 797         if (!ptr)
 798                 return;
 799
 800         mutex_lock(&pcpu_mutex);
 801
 802         chunk = pcpu_chunk_addr_search(addr);
 803         off = addr - chunk->vm->addr;
 804
 805         pcpu_free_area(chunk, off);
 806
 807         /* the chunk became fully free, kill one if there are other free ones */
 808         if (chunk->free_size == pcpu_unit_size) {
 809                 struct pcpu_chunk *pos;
 810
 811                 list_for_each_entry(pos,
 812                                     &pcpu_slot[pcpu_chunk_slot(chunk)], list)
 813                         if (pos != chunk) {
 814                                 pcpu_kill_chunk(pos);
 815                                 break;
 816                         }
 817         }
 818
 819         mutex_unlock(&pcpu_mutex);
 820 }
 821 EXPORT_SYMBOL_GPL(free_percpu);
 822
 823 /**
 824  * pcpu_setup_first_chunk - initialize the first percpu chunk
 825  * @get_page_fn: callback to fetch page pointer
 826  * @static_size: the size of static percpu area in bytes
 827  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
 828  * @dyn_size: free size for dynamic allocation in bytes, 0 for auto
 829  * @base_addr: mapped address, NULL for auto
 830  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 831  *
 832  * Initialize the first percpu chunk which contains the kernel static
 833  * perpcu area.  This function is to be called from arch percpu area
 834  * setup path.  The first two parameters are mandatory.  The rest are
 835  * optional.
 836  *
 837  * @get_page_fn() should return pointer to percpu page given cpu
 838  * number and page number.  It should at least return enough pages to
 839  * cover the static area.  The returned pages for static area should
 840  * have been initialized with valid data.  If @unit_size is specified,
 841  * it can also return pages after the static area.  NULL return
 842  * indicates end of pages for the cpu.  Note that @get_page_fn() must
 843  * return the same number of pages for all cpus.
 844  *
 845  * @unit_size, if non-zero, determines unit size and must be aligned
 846  * to PAGE_SIZE and equal to or larger than @static_size + @dyn_size.
 847  *
 848  * @dyn_size determines the number of free bytes after the static
 849  * area in the first chunk.  If zero, whatever left is available.
 850  * Specifying non-zero value make percpu leave the area after
 851  * @static_size + @dyn_size alone.
 852  *
 853  * Non-null @base_addr means that the caller already allocated virtual
 854  * region for the first chunk and mapped it.  percpu must not mess
 855  * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 856  * @populate_pte_fn doesn't make any sense.
 857  *
 858  * @populate_pte_fn is used to populate the pagetable.  NULL means the
 859  * caller already populated the pagetable.
 860  *
 861  * RETURNS:
 862  * The determined pcpu_unit_size which can be used to initialize
 863  * percpu access.
 864  */
 865 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 866                                      size_t static_size, size_t unit_size,
 867                                      size_t dyn_size, void *base_addr,
 868                                      pcpu_populate_pte_fn_t populate_pte_fn)
 869 {
 870         static struct vm_struct first_vm;
 871         static int smap[2];
 872         struct pcpu_chunk *schunk;
 873         unsigned int cpu;
 874         int nr_pages;
 875         int err, i;
 876
 877         /* santiy checks */
 878         BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 879         BUG_ON(!static_size);
 880         BUG_ON(!unit_size && dyn_size);
 881         BUG_ON(unit_size && unit_size < static_size + dyn_size);
 882         BUG_ON(unit_size & ~PAGE_MASK);
 883         BUG_ON(base_addr && !unit_size);
 884         BUG_ON(base_addr && populate_pte_fn);
 885
 886         if (unit_size)
 887                 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 888         else
 889                 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 890                                         PFN_UP(static_size));
 891
 892         pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 893         pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 894         pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 895                 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 896
 897         /*
 898          * Allocate chunk slots.  The additional last slot is for
 899          * empty chunks.
 900          */
 901         pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
 902         pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
 903         for (i = 0; i < pcpu_nr_slots; i++)
 904                 INIT_LIST_HEAD(&pcpu_slot[i]);
 905
 906         /* init static chunk */
 907         schunk = alloc_bootmem(pcpu_chunk_struct_size);
 908         INIT_LIST_HEAD(&schunk->list);
 909         schunk->vm = &first_vm;
 910         schunk->map = smap;
 911         schunk->map_alloc = ARRAY_SIZE(smap);
 912
 913         if (dyn_size)
 914                 schunk->free_size = dyn_size;
 915         else
 916                 schunk->free_size = pcpu_unit_size - static_size;
 917
 918         schunk->contig_hint = schunk->free_size;
 919
 920         schunk->map[schunk->map_used++] = -static_size;
 921         if (schunk->free_size)
 922                 schunk->map[schunk->map_used++] = schunk->free_size;
 923
 924         /* allocate vm address */
 925         first_vm.flags = VM_ALLOC;
 926         first_vm.size = pcpu_chunk_size;
 927
 928         if (!base_addr)
 929                 vm_area_register_early(&first_vm, PAGE_SIZE);
 930         else {
 931                 /*
 932                  * Pages already mapped.  No need to remap into
 933                  * vmalloc area.  In this case the static chunk can't
 934                  * be mapped or unmapped by percpu and is marked
 935                  * immutable.
 936                  */
 937                 first_vm.addr = base_addr;
 938                 schunk->immutable = true;
 939         }
 940
 941         /* assign pages */
 942         nr_pages = -1;
 943         for_each_possible_cpu(cpu) {
 944                 for (i = 0; i < pcpu_unit_pages; i++) {
 945                         struct page *page = get_page_fn(cpu, i);
 946
 947                         if (!page)
 948                                 break;
 949                         *pcpu_chunk_pagep(schunk, cpu, i) = page;
 950                 }
 951
 952                 BUG_ON(i < PFN_UP(static_size));
 953
 954                 if (nr_pages < 0)
 955                         nr_pages = i;
 956                 else
 957                         BUG_ON(nr_pages != i);
 958         }
 959
 960         /* map them */
 961         if (populate_pte_fn) {
 962                 for_each_possible_cpu(cpu)
 963                         for (i = 0; i < nr_pages; i++)
 964                                 populate_pte_fn(pcpu_chunk_addr(schunk,
 965                                                                 cpu, i));
 966
 967                 err = pcpu_map(schunk, 0, nr_pages);
 968                 if (err)
 969                         panic("failed to setup static percpu area, err=%d\n",
 970                               err);
 971         }
 972
 973         /* link the first chunk in */
 974         pcpu_chunk_relocate(schunk, -1);
 975         pcpu_chunk_addr_insert(schunk);
 976
 977         /* we're done */
 978         pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 979         return pcpu_unit_size;
 980 }