/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/dma.h>
#include <asm/system.h> /* for cli()/sti() */
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/bitops.h>
#include <asm/pgtable.h>
#include <asm/spinlock.h>
int nr_swap_pages = 0;
int nr_free_pages = 0;
/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 * of different sizes
 */
#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 6
#endif
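/*
 * NR_MEM_LISTS fixes the largest block the buddy allocator will hand out:
 * 2^(NR_MEM_LISTS-1) pages.  With 4kB pages (an assumption; PAGE_SIZE is
 * per-architecture) that is 128kB for the normal 6 lists and 8MB for the
 * 12 lists used on the AP1000, which is what the ring buffers above need.
 */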
/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
        struct page *next;
        struct page *prev;
        unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))
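/*
 * Because next/prev come first in both structures, the list head itself can
 * be cast to a struct page and used as a sentinel node: a queue is empty
 * when head->next points back at memory_head(head).
 */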
static struct free_area_struct free_area[NR_MEM_LISTS];
static inline void init_mem_queue(struct free_area_struct * head)
{
        head->next = memory_head(head);
        head->prev = memory_head(head);
}
static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
        struct page * next = head->next;

        entry->prev = memory_head(head);
        entry->next = next;
        next->prev = entry;
        head->next = entry;
}
static inline void remove_mem_queue(struct page * entry)
{
        struct page * next = entry->next;
        struct page * prev = entry->prev;
        next->prev = prev;
        prev->next = next;
}
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */
/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
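/*
 * Worked example of the hint: with order = 2, mask = ~0UL << 2 = ...111100,
 * so -mask = 1+~mask = 100 in binary = 1 << 2.  free_pages_ok() below uses
 * this to find a block's buddy: map_nr ^ -mask flips bit "order" of the
 * index, e.g. the buddy of the 4-page block at map_nr 8 is the one at 12.
 */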
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;
/*
 * This routine is used by the kernel swap daemon to determine
 * whether we have "enough" free pages. It is fairly arbitrary,
 * having a low-water and high-water mark.
 *
 * This returns:
 *  0 - urgent need for memory
 *  1 - need some memory, but do it slowly in the background
 *  2 - no need to even think about it.
 */
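/*
 * Between the two watermarks the static "available" flag provides
 * hysteresis: the function keeps answering 0 after dipping below
 * freepages.low, and 1 after climbing back above freepages.high.
 */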
int free_memory_available(void)
{
        static int available = 1;

        if (nr_free_pages < freepages.low) {
                available = 0;
                return 0;
        }

        if (nr_free_pages > freepages.high) {
                available = 1;
                return 2;
        }

        return available;
}
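/*
 * free_pages_ok() is the "free" half of the buddy system.  Each bit in
 * free_area[order].map covers one buddy pair and is set when exactly one of
 * the pair is free.  Freeing toggles the pair's bit: if it was clear, the
 * buddy is still in use and the block is simply queued at this order; if it
 * was set, the buddy is pulled off its list and the merged block is retried
 * one order higher.  For example, freeing order-0 page 5 while page 4 is
 * already free produces a 2-page block at map_nr 4, which is then checked
 * against its order-1 buddy at map_nr 6.
 */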
static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
        struct free_area_struct *area = free_area + order;
        unsigned long index = map_nr >> (1 + order);
        unsigned long mask = (~0UL) << order;
        unsigned long flags;

        spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

        map_nr &= mask;
        nr_free_pages -= mask;
        while (mask + (1 << (NR_MEM_LISTS-1))) {
                if (!test_and_change_bit(index, area->map))
                        break;
                remove_mem_queue(list(map_nr ^ -mask));
                mask <<= 1;
                area++;
                index >>= 1;
                map_nr &= mask;
        }
        add_mem_queue(area, list(map_nr));

#undef list

        spin_unlock_irqrestore(&page_alloc_lock, flags);
}
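/*
 * __free_page() and free_pages() drop one reference; the memory only goes
 * back to the buddy lists once the count reaches zero, and pages marked
 * PG_reserved are never freed at all.
 */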
void __free_page(struct page *page)
{
        if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
                if (PageSwapCache(page))
                        panic ("Freeing swap cache page");
                free_pages_ok(page->map_nr, 0);
                return;
        }
        if (PageSwapCache(page) && atomic_read(&page->count) == 1)
                printk(KERN_WARNING "VM: Releasing swap cache page at %p",
                        __builtin_return_address(0));
}
void free_pages(unsigned long addr, unsigned long order)
{
        unsigned long map_nr = MAP_NR(addr);

        if (map_nr < max_mapnr) {
                mem_map_t * map = mem_map + map_nr;
                if (PageReserved(map))
                        return;
                if (atomic_dec_and_test(&map->count)) {
                        if (PageSwapCache(map))
                                panic ("Freeing swap cache pages");
                        free_pages_ok(map_nr, order);
                        return;
                }
                if (PageSwapCache(map) && atomic_read(&map->count) == 1)
                        printk(KERN_WARNING
                                "VM: Releasing swap cache pages at %p",
                                __builtin_return_address(0));
        }
}
/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
        change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
#define RMQUEUE(order, dma) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
        do { struct page *prev = memory_head(area), *ret = prev->next; \
                while (memory_head(area) != ret) { \
                        if (!dma || CAN_DMA(ret)) { \
                                unsigned long map_nr = ret->map_nr; \
                                (prev->next = ret->next)->prev = prev; \
                                MARK_USED(map_nr, new_order, area); \
                                nr_free_pages -= 1 << order; \
                                EXPAND(ret, map_nr, order, new_order, area); \
                                spin_unlock_irqrestore(&page_alloc_lock, flags); \
                                return ADDRESS(map_nr); \
                        } \
                        prev = ret; \
                        ret = ret->next; \
                } \
                new_order++; area++; \
        } while (new_order < NR_MEM_LISTS); \
} while (0)
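/*
 * RMQUEUE scans the free list of the requested order and then each larger
 * order until it finds a block (skipping non-PageDMA pages when a DMA
 * buffer was asked for).  It unlinks the block, toggles its buddy-pair bit,
 * charges nr_free_pages for only the 1 << order pages actually handed out,
 * lets EXPAND give any excess back to the smaller lists, and returns
 * straight out of __get_free_pages() with the block's kernel address.
 */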
#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
        while (high > low) { \
                area--; high--; size >>= 1; \
                add_mem_queue(area, map); \
                MARK_USED(index, high, area); \
                index += size; \
                map += size; \
        } \
        atomic_set(&map->count, 1); \
        map->age = PAGE_INITIAL_AGE; \
} while (0)
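/*
 * EXPAND splits an oversized block back down to the requested order: each
 * pass returns the lower half to the next-smaller free list and keeps
 * splitting the upper half.  E.g. an order-0 request satisfied from an
 * order-2 block puts a 2-page piece back on the order-1 list and one page
 * back on the order-0 list, then initialises the remaining page
 * (count = 1, age = PAGE_INITIAL_AGE) for the caller.
 */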
unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
        unsigned long flags;

        if (order >= NR_MEM_LISTS)
                goto nopage;

        if (gfp_mask & __GFP_WAIT) {
                if (in_interrupt()) {
                        static int count = 0;
                        if (++count < 5) {
                                printk("gfp called nonatomically from interrupt %p\n",
                                        __builtin_return_address(0));
                        }
                        goto nopage;
                }

                if (freepages.min > nr_free_pages) {
                        int freed;
                        freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
                        /*
                         * Low priority (user) allocations must not
                         * succeed if we didn't have enough memory
                         * and we couldn't get more..
                         */
                        if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
                                goto nopage;
                }
        }
        spin_lock_irqsave(&page_alloc_lock, flags);
        RMQUEUE(order, (gfp_mask & GFP_DMA));
        spin_unlock_irqrestore(&page_alloc_lock, flags);

        /*
         * If we failed to find anything, we'll return NULL, but we'll
         * wake up kswapd _now_ and even wait for it synchronously if
         * we can.. This way we'll at least make some forward progress
         * over time.
         */
        wake_up(&kswapd_wait);
        if (gfp_mask & __GFP_WAIT)
                schedule();
nopage:
        return 0;
}
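/*
 * Typical use: __get_free_pages(GFP_KERNEL, 2) asks for four physically
 * contiguous pages and returns their kernel virtual address, or 0 when
 * nothing suitable could be found or reclaimed.
 */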
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
        unsigned long order, flags;
        unsigned long total = 0;

        printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
        printk("Free: %d (%d %d %d)\n",
                nr_free_pages,
                freepages.min,
                freepages.low,
                freepages.high);
        spin_lock_irqsave(&page_alloc_lock, flags);
        for (order=0 ; order < NR_MEM_LISTS; order++) {
                struct page * tmp;
                unsigned long nr = 0;
                for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
                        nr ++;
                }
                total += nr * ((PAGE_SIZE>>10) << order);
                printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
        }
        spin_unlock_irqrestore(&page_alloc_lock, flags);
        printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * set up the free-area data structures:
 *  - mark all pages reserved
 *  - mark all memory queues empty
 *  - clear the memory bitmaps
 */
unsigned long __init free_area_init(unsigned long start_mem, unsigned long end_mem)
{
        mem_map_t * p;
        unsigned long mask = PAGE_MASK;
        unsigned long i;

        /*
         * Select nr of pages we try to keep free for important stuff
         * with a minimum of 10 pages and a maximum of 256 pages, so
         * that we don't waste too much memory on large systems.
         * This is fairly arbitrary, but based on some behaviour
         * analysis.
         */
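        /*
         * The shift below works out to roughly 1/128th of memory, expressed
         * in pages.  E.g. with 4kB pages (an assumption) a 64MB machine has
         * 16384 pages, giving i = 128 and thus freepages.min/low/high of
         * 128/256/384 pages.
         */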
        i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
        if (i < 10)
                i = 10;
        if (i > 256)
                i = 256;
        freepages.min = i;
        freepages.low = i * 2;
        freepages.high = i * 3;
        mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
        p = mem_map + MAP_NR(end_mem);
        start_mem = LONG_ALIGN((unsigned long) p);
        memset(mem_map, 0, start_mem - (unsigned long) mem_map);
        do {
                --p;
                atomic_set(&p->count, 0);
                p->flags = (1 << PG_DMA) | (1 << PG_reserved);
                p->map_nr = p - mem_map;
        } while (p > mem_map);

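        /*
         * One bitmap per order: free_area[i].map gets one bit per 2^i-page
         * block (rounded up to whole bytes and then long-aligned), carved
         * out of start_mem right after mem_map and cleared to "no buddy
         * free".
         */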
        for (i = 0 ; i < NR_MEM_LISTS ; i++) {
                unsigned long bitmap_size;
                init_mem_queue(free_area+i);
                mask += mask;
                end_mem = (end_mem + ~mask) & mask;
                bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
                bitmap_size = (bitmap_size + 7) >> 3;
                bitmap_size = LONG_ALIGN(bitmap_size);
                free_area[i].map = (unsigned int *) start_mem;
                memset((void *) start_mem, 0, bitmap_size);
                start_mem += bitmap_size;
        }
        return start_mem;
}
/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
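/*
 * In other words: read_swap_cache() may have slept, so before installing
 * anything the pte is re-checked against the original swap entry; if it no
 * longer matches, someone else already handled the fault and the extra
 * reference is simply dropped.
 */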
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
        pte_t * page_table, unsigned long entry, int write_access)
{
        unsigned long page;
        struct page *page_map;

        page_map = read_swap_cache(entry);

        if (pte_val(*page_table) != entry) {
                if (page_map)
                        free_page_and_swap_cache(page_address(page_map));
                return;
        }
        if (!page_map) {
                set_pte(page_table, BAD_PAGE);
                swap_free(entry);
                oom(tsk);
                return;
        }

        page = page_address(page_map);
        vma->vm_mm->rss++;
        tsk->min_flt++;
        swap_free(entry);

        if (!write_access || is_page_shared(page_map)) {
                set_pte(page_table, mk_pte(page, vma->vm_page_prot));
                return;
        }

        /* The page is unshared, and we want write access.  In this
           case, it is safe to tear down the swap cache and give the
           page over entirely to this process. */

        delete_from_swap_cache(page_map);
        set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
        return;
}