/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/swap.h>
#include <linux/fs.h>
#include <linux/swapctl.h>
#include <linux/malloc.h>
#include <linux/blkdev.h> /* for blk_size */
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/shm.h>

#include <asm/bitops.h>
#include <asm/pgtable.h>

unsigned int nr_swapfiles = 0;

struct swap_list_t swap_list = {-1, -1};

struct swap_info_struct swap_info[MAX_SWAPFILES];

static inline int scan_swap_map(struct swap_info_struct *si)
{
	unsigned long offset;
	/*
	 * We try to cluster swap pages by allocating them
	 * sequentially in swap.  Once we've allocated
	 * SWAP_CLUSTER_MAX pages this way, however, we resort to
	 * first-free allocation, starting a new cluster.  This
	 * prevents us from scattering swap pages all over the entire
	 * swap partition, so that we reduce overall disk seek times
	 * between swap pages.  -- sct */
	if (si->cluster_nr) {
		while (si->cluster_next <= si->highest_bit) {
			offset = si->cluster_next++;
			if (si->swap_map[offset])
				continue;
			if (test_bit(offset, si->swap_lockmap))
				continue;
			si->cluster_nr--;
			goto got_page;
		}
	}
	si->cluster_nr = SWAP_CLUSTER_MAX;
	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
		if (si->swap_map[offset])
			continue;
		if (test_bit(offset, si->swap_lockmap))
			continue;
		si->lowest_bit = offset;
got_page:
		si->swap_map[offset] = 1;
		nr_swap_pages--;
		if (offset == si->highest_bit)
			si->highest_bit--;
		si->cluster_next = offset;
		return offset;
	}
	return 0;
}
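
/*
 * Allocate a swap slot from the highest-priority swap area that still
 * has free space.  Areas of equal priority are used round-robin via
 * swap_list.next.  Returns a SWP_ENTRY value for the allocated slot,
 * or 0 if no swap space is left.
 */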
unsigned long get_swap_page(void)
{
	struct swap_info_struct * p;
	unsigned long offset, entry;
	int type, wrapped = 0;

	type = swap_list.next;
	if (type < 0)
		return 0;
	if (nr_swap_pages == 0)
		return 0;

	while (1) {
		p = &swap_info[type];
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			offset = scan_swap_map(p);
			if (offset) {
				entry = SWP_ENTRY(type,offset);
				type = swap_info[type].next;
				if (type < 0 ||
				    p->prio != swap_info[type].prio) {
					swap_list.next = swap_list.head;
				} else {
					swap_list.next = type;
				}
				return entry;
			}
		}
		type = p->next;
		if (!wrapped) {
			if (type < 0 || p->prio != swap_info[type].prio) {
				type = swap_list.head;
				wrapped = 1;
			}
		} else if (type < 0) {
			return 0;	/* out of swap space */
		}
	}
}
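
/*
 * Release one reference to a swap entry.  When the use count drops to
 * zero the slot is free again and nr_swap_pages is credited; counts
 * pinned at SWAP_MAP_MAX are left untouched.
 */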
void swap_free(unsigned long entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;

	if (!entry)
		goto out;

	type = SWP_TYPE(entry);
	if (type & SHM_SWP_TYPE)
		goto out;
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = & swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	if (p->prio > swap_info[swap_list.next].prio)
		swap_list.next = swap_list.head;
	offset = SWP_OFFSET(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (offset < p->lowest_bit)
		p->lowest_bit = offset;
	if (offset > p->highest_bit)
		p->highest_bit = offset;
	if (!p->swap_map[offset])
		goto bad_free;
	if (p->swap_map[offset] < SWAP_MAP_MAX) {
		if (!--p->swap_map[offset])
			nr_swap_pages++;
	}
#ifdef DEBUG_SWAP
	printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
	       entry, p->swap_map[offset]);
#endif
out:
	return;

bad_nofile:
	printk("swap_free: Trying to free nonexistent swap-page\n");
	goto out;
bad_device:
	printk("swap_free: Trying to free swap from unused swap-device\n");
	goto out;
bad_offset:
	printk("swap_free: offset exceeds max\n");
	goto out;
bad_free:
	printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
	goto out;
}

/*
 * The swap entry has been read in advance, and we return 1 to indicate
 * that the page has been used or is no longer needed.
 *
 * Always set the resulting pte to be nowrite (the same as COW pages
 * after one process has exited).  We don't know just how many PTEs will
 * share this swap entry, so be cautious and let do_wp_page work out
 * what to do if a write is requested later.
 */
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
	pte_t *dir, unsigned long entry, unsigned long page)
{
	pte_t pte = *dir;

	if (pte_none(pte))
		return;
	if (pte_present(pte)) {
		/* If this entry is swap-cached, then page must already
		   hold the right address for any copies in physical
		   memory */
		if (pte_page(pte) != page)
			return;
		/* We will be removing the swap cache in a moment, so... */
		set_pte(dir, pte_mkdirty(pte));
		return;
	}
	if (pte_val(pte) != entry)
		return;
	set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	swap_free(entry);
	atomic_inc(&mem_map[MAP_NR(page)].count);
	++vma->vm_mm->rss;
}
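
/*
 * The helpers below (unuse_pmd, unuse_pgd, unuse_vma, unuse_process)
 * walk a process' page tables level by level so that unuse_pte can
 * replace every reference to the swap entry with the in-memory page.
 */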
static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	unsigned long entry, unsigned long page)
{
	pte_t * pte;
	unsigned long end;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
}

static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	unsigned long entry, unsigned long page)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		unuse_pmd(vma, pmd, address, end - address, offset, entry,
			  page);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
}

static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
			unsigned long entry, unsigned long page)
{
	unsigned long start = vma->vm_start, end = vma->vm_end;

	while (start < end) {
		unuse_pgd(vma, pgdir, start, end - start, entry, page);
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
}

static void unuse_process(struct mm_struct * mm, unsigned long entry,
			unsigned long page)
{
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	if (!mm || mm == &init_mm)
		return;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
		unuse_vma(vma, pgd, entry, page);
	}
	return;
}

/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct task_struct *p;
	unsigned long page = 0;
	struct page *page_map;
	unsigned long entry;
	int i;

	while (1) {
		/*
		 * Find a swap page in use and read it in.
		 */
		for (i = 1 , entry = 0; i < si->max ; i++) {
			if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
				entry = SWP_ENTRY(type, i);
				break;
			}
		}
		if (!entry)
			break;

		/* Get a page for the entry, using the existing swap
		   cache page if there is one.  Otherwise, get a clean
		   page and read the swap into it. */
		page_map = read_swap_cache(entry);
		if (!page_map)
			return -ENOMEM;
		page = page_address(page_map);
		read_lock(&tasklist_lock);
		for_each_task(p)
			unuse_process(p->mm, entry, page);
		read_unlock(&tasklist_lock);
		shm_unuse(entry, page);
		/* Now get rid of the extra reference to the temporary
		   page we've been using. */
		if (PageSwapCache(page_map))
			delete_from_swap_cache(page_map);
		free_page(page);
		/* If the swap map still shows the entry in use, the only
		   legitimate reason is an overflowed (SWAP_MAP_MAX)
		   count; reclaim the slot either way. */
		if (si->swap_map[i] != 0) {
			if (si->swap_map[i] != SWAP_MAP_MAX)
				printk("try_to_unuse: entry %08lx "
				       "not in use\n", entry);
			si->swap_map[i] = 0;
			nr_swap_pages++;
		}
	}
	return 0;
}
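
/*
 * Turn off a swap area: unlink it from swap_list, pull everything it
 * still holds back into memory with try_to_unuse(), then release the
 * underlying device or file and free the swap map and lock map.  If
 * unuse fails, the area is re-inserted into swap_list by priority.
 */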
asmlinkage int sys_swapoff(const char * specialfile)
{
	struct swap_info_struct * p = NULL;
	struct dentry * dentry;
	struct file filp;
	int i, type, prev;
	int err = -EPERM;

	lock_kernel();
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	dentry = namei(specialfile);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out;

	prev = -1;
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			if (p->swap_file) {
				if (p->swap_file == dentry)
					break;
			} else {
				if (S_ISBLK(dentry->d_inode->i_mode)
				    && (p->swap_device == dentry->d_inode->i_rdev))
					break;
			}
		}
		prev = type;
	}
	err = -EINVAL;
	if (type < 0){
		dput(dentry);
		goto out;
	}
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	p->flags = SWP_USED;
	err = try_to_unuse(type);
	if (err) {
		dput(dentry);
		/* re-insert swap space back into swap_list */
		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
			if (p->prio >= swap_info[i].prio)
				break;
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = p - swap_info;
		else
			swap_info[prev].next = p - swap_info;
		p->flags = SWP_WRITEOK;
		goto out;
	}
	if(p->swap_device){
		memset(&filp, 0, sizeof(filp));
		filp.f_dentry = dentry;
		filp.f_mode = 3; /* read write */
		/* open it again to get fops */
		if( !blkdev_open(dentry->d_inode, &filp) &&
		   filp.f_op && filp.f_op->release){
			/* release twice: once for this open, once for
			   the reference taken when the device was
			   swapped on */
			filp.f_op->release(dentry->d_inode,&filp);
			filp.f_op->release(dentry->d_inode,&filp);
		}
	}
	dput(dentry);

	nr_swap_pages -= p->pages;
	dput(p->swap_file);
	p->swap_file = NULL;
	p->swap_device = 0;
	vfree(p->swap_map);
	p->swap_map = NULL;
	vfree(p->swap_lockmap);
	p->swap_lockmap = NULL;
	p->flags = 0;
	err = 0;
out:
	unlock_kernel();
	return err;
}
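
/*
 * Build a human-readable table of the active swap areas (path, type,
 * size, usage and priority) for the /proc swap listing.
 */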
int get_swaparea_info(char *buf)
{
	char * page = (char *) __get_free_page(GFP_KERNEL);
	struct swap_info_struct *ptr = swap_info;
	int i, j, len = 0, usedswap;

	if (!page)
		return -ENOMEM;

	len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
	for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
		if (ptr->flags & SWP_USED) {
			char * path = d_path(ptr->swap_file, page, PAGE_SIZE);

			len += sprintf(buf + len, "%-31s ", path);

			if (!ptr->swap_device)
				len += sprintf(buf + len, "file\t\t");
			else
				len += sprintf(buf + len, "partition\t");

			usedswap = 0;
			for (j = 0; j < ptr->max; ++j)
				switch (ptr->swap_map[j]) {
					case SWAP_MAP_BAD:
					case 0:
						continue;
					default:
						usedswap++;
				}
			len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
				usedswap << (PAGE_SHIFT - 10), ptr->prio);
		}
	}
	free_page((unsigned long) page);
	return len;
}

/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
{
	struct swap_info_struct * p;
	struct dentry * swap_dentry;
	unsigned int type;
	int i, j, prev;
	int error = -EPERM;
	struct file filp;
	static int least_priority = 0;
	union swap_header *swap_header = 0;
	int swap_header_version;
	int lock_map_size = PAGE_SIZE;
	int nr_good_pages = 0;
	char tmp_lock_map = 0;

	lock_kernel();
	if (!capable(CAP_SYS_ADMIN))
		goto out;
	memset(&filp, 0, sizeof(filp));
	p = swap_info;
	for (type = 0 ; type < nr_swapfiles ; type++,p++)
		if (!(p->flags & SWP_USED))
			break;
	if (type >= MAX_SWAPFILES)
		goto out;
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	p->flags = SWP_USED;
	p->swap_file = NULL;
	p->swap_device = 0;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->lowest_bit = 0;
	p->highest_bit = 0;
	p->cluster_nr = 0;
	p->max = 1;
	p->next = -1;
	if (swap_flags & SWAP_FLAG_PREFER) {
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
	} else {
		p->prio = --least_priority;
	}
	swap_dentry = namei(specialfile);
	error = PTR_ERR(swap_dentry);
	if (IS_ERR(swap_dentry))
		goto bad_swap_2;

	p->swap_file = swap_dentry;
	error = -EINVAL;

	if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
		p->swap_device = swap_dentry->d_inode->i_rdev;
		set_blocksize(p->swap_device, PAGE_SIZE);

		filp.f_dentry = swap_dentry;
		filp.f_mode = 3; /* read write */
		error = blkdev_open(swap_dentry->d_inode, &filp);
		if (error)
			goto bad_swap_2;
		set_blocksize(p->swap_device, PAGE_SIZE);
		error = -ENODEV;
		if (!p->swap_device ||
		    (blk_size[MAJOR(p->swap_device)] &&
		     !blk_size[MAJOR(p->swap_device)][MINOR(p->swap_device)]))
			goto bad_swap;
		error = -EBUSY;
		for (i = 0 ; i < nr_swapfiles ; i++) {
			if (i == type)
				continue;
			if (p->swap_device == swap_info[i].swap_device)
				goto bad_swap;
		}
	} else if (!S_ISREG(swap_dentry->d_inode->i_mode))
		goto bad_swap;
	swap_header = (void *) __get_free_page(GFP_USER);
	if (!swap_header) {
		printk("Unable to start swapping: out of memory :-)\n");
		error = -ENOMEM;
		goto bad_swap;
	}

	p->swap_lockmap = &tmp_lock_map;
	rw_swap_page_nocache(READ, SWP_ENTRY(type,0), (char *) swap_header);
	p->swap_lockmap = 0;

	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
		swap_header_version = 1;
	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
		swap_header_version = 2;
	else {
		printk("Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}

	switch (swap_header_version) {
	case 1:
		memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
		j = 0;
		p->lowest_bit = 0;
		p->highest_bit = 0;
		for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
			if (test_bit(i,(char *) swap_header)) {
				if (!p->lowest_bit)
					p->lowest_bit = i;
				p->highest_bit = i;
				p->max = i+1;
				j++;
			}
		}
		nr_good_pages = j;
		p->swap_map = vmalloc(p->max * sizeof(short));
		if (!p->swap_map) {
			error = -ENOMEM;
			goto bad_swap;
		}
		for (i = 1 ; i < p->max ; i++) {
			if (test_bit(i,(char *) swap_header))
				p->swap_map[i] = 0;
			else
				p->swap_map[i] = SWAP_MAP_BAD;
		}
		break;

	case 2:
		/* Check the swap header's sub-version and the size of
		   the swap file and bad block lists */
		if (swap_header->info.version != 1) {
			printk(KERN_WARNING
			       "Unable to handle swap header version %d\n",
			       swap_header->info.version);
			error = -EINVAL;
			goto bad_swap;
		}

		p->lowest_bit = 1;
		p->highest_bit = swap_header->info.last_page - 1;
		p->max = swap_header->info.last_page;

		if (p->max >= 0x7fffffffL/PAGE_SIZE ||
		    (void *) &swap_header->info.badpages[swap_header->info.nr_badpages-1] >= (void *) swap_header->magic.magic) {
			error = -EINVAL;
			goto bad_swap;
		}

		/* OK, set up the swap map and apply the bad block list */
		if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
			error = -ENOMEM;
			goto bad_swap;
		}

		error = 0;
		memset(p->swap_map, 0, p->max * sizeof(short));
		for (i=0; i<swap_header->info.nr_badpages; i++) {
			int page = swap_header->info.badpages[i];
			if (page <= 0 || page >= swap_header->info.last_page)
				error = -EINVAL;
			else
				p->swap_map[page] = SWAP_MAP_BAD;
		}
		nr_good_pages = swap_header->info.last_page - i;
		lock_map_size = (p->max + 7) / 8;
		if (error)
			goto bad_swap;
	}

	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}
	p->swap_map[0] = SWAP_MAP_BAD;
	if (!(p->swap_lockmap = vmalloc (lock_map_size))) {
		error = -ENOMEM;
		goto bad_swap;
	}
	memset(p->swap_lockmap,0,lock_map_size);
	p->flags = SWP_WRITEOK;
	p->pages = nr_good_pages;
	nr_swap_pages += nr_good_pages;
	printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
	       nr_good_pages<<(PAGE_SHIFT-10), p->prio);

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio) {
			break;
		}
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
	error = 0;
	goto out;
bad_swap:
	if(filp.f_op && filp.f_op->release)
		filp.f_op->release(filp.f_dentry->d_inode,&filp);
bad_swap_2:
	if (p->swap_lockmap)
		vfree(p->swap_lockmap);
	if (p->swap_map)
		vfree(p->swap_map);
	dput(p->swap_file);
	p->swap_device = 0;
	p->swap_file = NULL;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->flags = 0;
out:
	if (swap_header)
		free_page((long) swap_header);
	unlock_kernel();
	return error;
}
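
/*
 * Fill in the swap-related fields of a sysinfo structure; totalswap
 * and freeswap are counted in pages and then converted to bytes.
 */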
void si_swapinfo(struct sysinfo *val)
{
	unsigned int i, j;

	val->freeswap = val->totalswap = 0;
	for (i = 0; i < nr_swapfiles; i++) {
		if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
			continue;
		for (j = 0; j < swap_info[i].max; ++j)
			switch (swap_info[i].swap_map[j]) {
				case SWAP_MAP_BAD:
					continue;
				case 0:
					++val->freeswap;
					/* fall through: free slots count
					   towards total swap as well */
				default:
					++val->totalswap;
			}
	}
	val->freeswap <<= PAGE_SHIFT;
	val->totalswap <<= PAGE_SHIFT;
	return;
}