/*
 *  linux/mm/page_alloc.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/config.h>
#include <linux/mm.h>		/* for mem_map, MAP_NR(), struct page */
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/pagemap.h>

#include <asm/system.h> /* for cli()/sti() */
#include <asm/uaccess.h> /* for copy_to/from_user */
#include <asm/bitops.h>
#include <asm/pgtable.h>
#include <asm/spinlock.h>

int nr_swap_pages = 0;
int nr_free_pages = 0;

/*
 * Free area management
 *
 * The free_area_list arrays point to the queue heads of the free areas
 */

#if CONFIG_AP1000
/* the AP+ needs to allocate 8MB contiguous, aligned chunks of ram
   for the ring buffers */
#define NR_MEM_LISTS 12
#else
#define NR_MEM_LISTS 6
#endif
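
/*
 * Illustrative note (not from the original source), assuming 4kB pages:
 * NR_MEM_LISTS free lists allow allocations up to order NR_MEM_LISTS-1,
 * i.e. 2^5 pages = 128kB in the default configuration, and 2^11 pages
 * = 8MB when NR_MEM_LISTS is 12 for the AP+ ring buffers above.
 */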

/* The start of this MUST match the start of "struct page" */
struct free_area_struct {
	struct page *next;
	struct page *prev;
	unsigned int * map;
};

#define memory_head(x) ((struct page *)(x))

static struct free_area_struct free_area[NR_MEM_LISTS];

static inline void init_mem_queue(struct free_area_struct * head)
{
	head->next = memory_head(head);
	head->prev = memory_head(head);
}

static inline void add_mem_queue(struct free_area_struct * head, struct page * entry)
{
	struct page * next = head->next;

	entry->prev = memory_head(head);
	entry->next = next;
	next->prev = entry;
	head->next = entry;
}

static inline void remove_mem_queue(struct page * entry)
{
	struct page * next = entry->next;
	struct page * prev = entry->prev;
	next->prev = prev;
	prev->next = next;
}

/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 */

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 *
 * Hint: -mask = 1+~mask
 */
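
/*
 * Illustration (not from the original source): for order 1,
 * mask = ~0UL << 1 = ...11110, so -mask = 1+~mask = 2 = 1UL << order.
 * free_pages_ok() below relies on this: "map_nr ^ -mask" flips bit
 * `order` of the map number to locate a block's buddy, and
 * "map_nr &= mask" rounds the map number down to the start of the
 * merged higher-order block.
 */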
spinlock_t page_alloc_lock = SPIN_LOCK_UNLOCKED;

/*
 * This routine is used by the kernel swap daemon to determine
 * whether we have "enough" free pages. It is fairly arbitrary,
 * having a low-water and high-water mark.
 *
 *  0 - urgent need for memory
 *  1 - need some memory, but do it slowly in the background
 *  2 - no need to even think about it.
 */
int free_memory_available(void)
{
	static int available = 1;

	if (nr_free_pages < freepages.low) {
		/* ... below the low-water mark ... */
	}
	if (nr_free_pages > freepages.high) {
		/* ... above the high-water mark ... */
	}
	/* ... */
	return available;
}

static inline void free_pages_ok(unsigned long map_nr, unsigned long order)
{
	struct free_area_struct *area = free_area + order;
	unsigned long index = map_nr >> (1 + order);
	unsigned long mask = (~0UL) << order;
	unsigned long flags;

	spin_lock_irqsave(&page_alloc_lock, flags);

#define list(x) (mem_map+(x))

	map_nr &= mask;
	nr_free_pages -= mask;
	while (mask + (1 << (NR_MEM_LISTS-1))) {
		if (!test_and_change_bit(index, area->map))
			break;
		/* The buddy is free too: take it off its list and merge. */
		remove_mem_queue(list(map_nr ^ -mask));
		mask <<= 1;
		area++;
		index >>= 1;
		map_nr &= mask;
	}
	add_mem_queue(area, list(map_nr));

#undef list

	spin_unlock_irqrestore(&page_alloc_lock, flags);
}
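
/*
 * Coalescing illustration (not from the original source): freeing the
 * order-0 page with map_nr 5 checks buddy 5 ^ 1 == 4; if page 4 is
 * already free it is taken off the order-0 list and the combined
 * order-1 block starting at map_nr 4 is then checked against its own
 * buddy, 4 ^ 2 == 6, and so on up the free lists.
 */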

void __free_page(struct page *page)
{
	if (!PageReserved(page) && atomic_dec_and_test(&page->count)) {
		if (PageSwapCache(page))
			panic ("Freeing swap cache page");
		free_pages_ok(page->map_nr, 0);
		return;
	}
	if (PageSwapCache(page) && atomic_read(&page->count) == 1)
		printk(KERN_WARNING "VM: Releasing swap cache page at %p",
			__builtin_return_address(0));
}

void free_pages(unsigned long addr, unsigned long order)
{
	unsigned long map_nr = MAP_NR(addr);

	if (map_nr < max_mapnr) {
		mem_map_t * map = mem_map + map_nr;
		if (PageReserved(map))
			return;
		if (atomic_dec_and_test(&map->count)) {
			if (PageSwapCache(map))
				panic ("Freeing swap cache pages");
			free_pages_ok(map_nr, order);
			return;
		}
		if (PageSwapCache(map) && atomic_read(&map->count) == 1)
			printk(KERN_WARNING
				"VM: Releasing swap cache pages at %p",
				__builtin_return_address(0));
	}
}

/*
 * Some ugly macros to speed up __get_free_pages()..
 */
#define MARK_USED(index, order, area) \
	change_bit((index) >> (1+(order)), (area)->map)
#define CAN_DMA(x) (PageDMA(x))
#define ADDRESS(x) (PAGE_OFFSET + ((x) << PAGE_SHIFT))
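
/*
 * Illustration (assuming the usual i386 values PAGE_OFFSET == 0xC0000000
 * and PAGE_SHIFT == 12, i.e. 4kB pages): ADDRESS(5) == 0xC0005000, the
 * kernel-virtual address of physical page frame 5.  MARK_USED() flips
 * the same buddy-pair bit that free_pages_ok() toggles above.
 */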

#define RMQUEUE(order, dma) \
do { struct free_area_struct * area = free_area+order; \
     unsigned long new_order = order; \
	do { struct page *prev = memory_head(area), *ret = prev->next; \
		while (memory_head(area) != ret) { \
			if (!dma || CAN_DMA(ret)) { \
				unsigned long map_nr = ret->map_nr; \
				(prev->next = ret->next)->prev = prev; \
				MARK_USED(map_nr, new_order, area); \
				nr_free_pages -= 1 << order; \
				EXPAND(ret, map_nr, order, new_order, area); \
				spin_unlock_irqrestore(&page_alloc_lock, flags); \
				return ADDRESS(map_nr); \
			} \
			prev = ret; \
			ret = ret->next; \
		} \
		new_order++; area++; \
	} while (new_order < NR_MEM_LISTS); \
} while (0)

#define EXPAND(map,index,low,high,area) \
do { unsigned long size = 1 << high; \
	while (high > low) { \
		area--; high--; size >>= 1; \
		add_mem_queue(area, map); \
		MARK_USED(index, high, area); \
		index += size; \
		map += size; \
	} \
	atomic_set(&map->count, 1); \
	map->age = PAGE_INITIAL_AGE; \
} while (0)
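
/*
 * Worked example (not from the original source): if RMQUEUE() finds an
 * order-3 (8 page) block while satisfying an order-1 (2 page) request,
 * EXPAND() returns the first 4 pages to the order-2 free list, the next
 * 2 pages to the order-1 free list, and hands out the remaining 2-page
 * piece with its reference count set to 1.
 */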

unsigned long __get_free_pages(int gfp_mask, unsigned long order)
{
	unsigned long flags;

	if (order >= NR_MEM_LISTS)
		goto nopage;

	if (gfp_mask & __GFP_WAIT) {
		if (in_interrupt()) {
			static int count = 0;
			if (++count < 5)	/* rate-limit the warning */
				printk("gfp called nonatomically from interrupt %p\n",
					__builtin_return_address(0));
			goto nopage;
		}

		if (freepages.min > nr_free_pages) {
			int freed;
			freed = try_to_free_pages(gfp_mask, SWAP_CLUSTER_MAX);
			/*
			 * Low priority (user) allocations must not
			 * succeed if we didn't have enough memory
			 * and we couldn't get more..
			 */
			if (!freed && !(gfp_mask & (__GFP_MED | __GFP_HIGH)))
				goto nopage;
		}
	}
	spin_lock_irqsave(&page_alloc_lock, flags);
	RMQUEUE(order, (gfp_mask & GFP_DMA));	/* returns on success */
	spin_unlock_irqrestore(&page_alloc_lock, flags);
nopage:
	return 0;
}
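
/*
 * Usage sketch (not from the original file): a caller that needs four
 * physically contiguous pages could do
 *
 *	unsigned long buf = __get_free_pages(GFP_KERNEL, 2);
 *	if (buf) {
 *		...
 *		free_pages(buf, 2);
 *	}
 *
 * GFP_KERNEL includes __GFP_WAIT, so such an allocation may sleep and
 * must not be made from interrupt context.
 */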

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas(void)
{
	unsigned long order, flags;
	unsigned long total = 0;

	printk("Free pages: %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
	printk("Free: %d (%d %d %d)\n",
		nr_free_pages,
		freepages.min,
		freepages.low,
		freepages.high);
	spin_lock_irqsave(&page_alloc_lock, flags);
	for (order=0 ; order < NR_MEM_LISTS ; order++) {
		struct page * tmp;
		unsigned long nr = 0;
		for (tmp = free_area[order].next ; tmp != memory_head(free_area+order) ; tmp = tmp->next) {
			nr++;
		}
		total += nr * ((PAGE_SIZE>>10) << order);
		printk("%lu*%lukB ", nr, (unsigned long)((PAGE_SIZE>>10) << order));
	}
	spin_unlock_irqrestore(&page_alloc_lock, flags);
	printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
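
/*
 * Example (assuming a 32-bit long): LONG_ALIGN(13) == 16 and
 * LONG_ALIGN(16) == 16, i.e. round up to the next multiple of
 * sizeof(long).
 */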

/*
 * set up the free-area data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
__initfunc(unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem))
{
	mem_map_t * p;
	unsigned long mask = PAGE_MASK;
	unsigned long i;

	/*
	 * Select nr of pages we try to keep free for important stuff
	 * with a minimum of 10 pages and a maximum of 256 pages, so
	 * that we don't waste too much memory on large systems.
	 * This is fairly arbitrary, but based on some behaviour
	 * analysis.
	 */
	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+7);
	if (i < 10)
		i = 10;
	if (i > 256)
		i = 256;
	freepages.min = i;
	freepages.low = i * 2;
	freepages.high = i * 3;
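
	/*
	 * Worked example (hypothetical 32MB machine with 4kB pages):
	 * 8192 pages >> 7 gives i = 64, which is inside the 10..256
	 * clamp, so freepages.min = 64, freepages.low = 128 and
	 * freepages.high = 192.
	 */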
	mem_map = (mem_map_t *) LONG_ALIGN(start_mem);
	p = mem_map + MAP_NR(end_mem);
	start_mem = LONG_ALIGN((unsigned long) p);
	memset(mem_map, 0, start_mem - (unsigned long) mem_map);
	do {
		--p;
		atomic_set(&p->count, 0);
		p->flags = (1 << PG_DMA) | (1 << PG_reserved);
		p->map_nr = p - mem_map;
	} while (p > mem_map);

	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
		unsigned long bitmap_size;
		init_mem_queue(free_area+i);
		mask += mask;
		end_mem = (end_mem + ~mask) & mask;
		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
		bitmap_size = (bitmap_size + 7) >> 3;
		bitmap_size = LONG_ALIGN(bitmap_size);
		free_area[i].map = (unsigned int *) start_mem;
		memset((void *) start_mem, 0, bitmap_size);
		start_mem += bitmap_size;
	}
	return start_mem;
}
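
/*
 * Sizing example (hypothetical 32MB of memory, 4kB pages): for i == 0
 * there are 8192 page-sized blocks, so bitmap_size is (8192+7)>>3 =
 * 1024 bytes; for i == 1 it is 512 bytes, and so on, halving for each
 * higher order.
 */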

/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
	pte_t * page_table, unsigned long entry, int write_access)
{
	unsigned long page;
	struct page *page_map;

	page_map = read_swap_cache(entry);

	if (pte_val(*page_table) != entry) {
		/* Somebody else already faulted this entry in: drop our copy. */
		free_page_and_swap_cache(page_address(page_map));
		return;
	}
	if (!page_map) {
		/* The swap cache read failed: map in the bad page and bail out. */
		set_pte(page_table, BAD_PAGE);
		/* ... */
		return;
	}

	page = page_address(page_map);
	/* ... account for the fault and free the swap entry ... */

	if (!write_access || is_page_shared(page_map)) {
		set_pte(page_table, mk_pte(page, vma->vm_page_prot));
		return;
	}

	/* The page is unshared, and we want write access. In this
	   case, it is safe to tear down the swap cache and give the
	   page over entirely to this process. */

	delete_from_swap_cache(page_map);
	set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
}