/* Extracted from the davej-history git web viewer (import of 2.2.7);
   viewer banner and blob id removed so the file is valid C. */
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */
#include <linux/malloc.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/blkdev.h> /* for blk_size */
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/shm.h>

#include <asm/pgtable.h>
20 unsigned int nr_swapfiles = 0;
22 struct swap_list_t swap_list = {-1, -1};
24 struct swap_info_struct swap_info[MAX_SWAPFILES];
26 #define SWAPFILE_CLUSTER 256
28 static inline int scan_swap_map(struct swap_info_struct *si)
30 unsigned long offset;
31 /*
32 * We try to cluster swap pages by allocating them
33 * sequentially in swap. Once we've allocated
34 * SWAPFILE_CLUSTER pages this way, however, we resort to
35 * first-free allocation, starting a new cluster. This
36 * prevents us from scattering swap pages all over the entire
37 * swap partition, so that we reduce overall disk seek times
38 * between swap pages. -- sct */
39 if (si->cluster_nr) {
40 while (si->cluster_next <= si->highest_bit) {
41 offset = si->cluster_next++;
42 if (si->swap_map[offset])
43 continue;
44 if (test_bit(offset, si->swap_lockmap))
45 continue;
46 si->cluster_nr--;
47 goto got_page;
50 si->cluster_nr = SWAPFILE_CLUSTER;
51 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
52 if (si->swap_map[offset])
53 continue;
54 if (test_bit(offset, si->swap_lockmap))
55 continue;
56 si->lowest_bit = offset;
57 got_page:
58 si->swap_map[offset] = 1;
59 nr_swap_pages--;
60 if (offset == si->highest_bit)
61 si->highest_bit--;
62 si->cluster_next = offset;
63 return offset;
65 return 0;
68 unsigned long get_swap_page(void)
70 struct swap_info_struct * p;
71 unsigned long offset, entry;
72 int type, wrapped = 0;
74 type = swap_list.next;
75 if (type < 0)
76 return 0;
77 if (nr_swap_pages == 0)
78 return 0;
80 while (1) {
81 p = &swap_info[type];
82 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
83 offset = scan_swap_map(p);
84 if (offset) {
85 entry = SWP_ENTRY(type,offset);
86 type = swap_info[type].next;
87 if (type < 0 ||
88 p->prio != swap_info[type].prio)
90 swap_list.next = swap_list.head;
92 else
94 swap_list.next = type;
96 return entry;
99 type = p->next;
100 if (!wrapped) {
101 if (type < 0 || p->prio != swap_info[type].prio) {
102 type = swap_list.head;
103 wrapped = 1;
105 } else if (type < 0) {
106 return 0; /* out of swap space */
112 void swap_free(unsigned long entry)
114 struct swap_info_struct * p;
115 unsigned long offset, type;
117 if (!entry)
118 goto out;
120 type = SWP_TYPE(entry);
121 if (type & SHM_SWP_TYPE)
122 goto out;
123 if (type >= nr_swapfiles)
124 goto bad_nofile;
125 p = & swap_info[type];
126 if (!(p->flags & SWP_USED))
127 goto bad_device;
128 if (p->prio > swap_info[swap_list.next].prio)
129 swap_list.next = swap_list.head;
130 offset = SWP_OFFSET(entry);
131 if (offset >= p->max)
132 goto bad_offset;
133 if (offset < p->lowest_bit)
134 p->lowest_bit = offset;
135 if (offset > p->highest_bit)
136 p->highest_bit = offset;
137 if (!p->swap_map[offset])
138 goto bad_free;
139 if (p->swap_map[offset] < SWAP_MAP_MAX) {
140 if (!--p->swap_map[offset])
141 nr_swap_pages++;
143 #ifdef DEBUG_SWAP
144 printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
145 entry, p->swap_map[offset]);
146 #endif
147 out:
148 return;
150 bad_nofile:
151 printk("swap_free: Trying to free nonexistent swap-page\n");
152 goto out;
153 bad_device:
154 printk("swap_free: Trying to free swap from unused swap-device\n");
155 goto out;
156 bad_offset:
157 printk("swap_free: offset exceeds max\n");
158 goto out;
159 bad_free:
160 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
161 goto out;
165 * The swap entry has been read in advance, and we return 1 to indicate
166 * that the page has been used or is no longer needed.
168 * Always set the resulting pte to be nowrite (the same as COW pages
169 * after one process has exited). We don't know just how many PTEs will
170 * share this swap entry, so be cautious and let do_wp_page work out
171 * what to do if a write is requested later.
173 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
174 pte_t *dir, unsigned long entry, unsigned long page)
176 pte_t pte = *dir;
178 if (pte_none(pte))
179 return;
180 if (pte_present(pte)) {
181 /* If this entry is swap-cached, then page must already
182 hold the right address for any copies in physical
183 memory */
184 if (pte_page(pte) != page)
185 return;
186 /* We will be removing the swap cache in a moment, so... */
187 set_pte(dir, pte_mkdirty(pte));
188 return;
190 if (pte_val(pte) != entry)
191 return;
192 set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
193 swap_free(entry);
194 atomic_inc(&mem_map[MAP_NR(page)].count);
195 ++vma->vm_mm->rss;
198 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
199 unsigned long address, unsigned long size, unsigned long offset,
200 unsigned long entry, unsigned long page)
202 pte_t * pte;
203 unsigned long end;
205 if (pmd_none(*dir))
206 return;
207 if (pmd_bad(*dir)) {
208 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
209 pmd_clear(dir);
210 return;
212 pte = pte_offset(dir, address);
213 offset += address & PMD_MASK;
214 address &= ~PMD_MASK;
215 end = address + size;
216 if (end > PMD_SIZE)
217 end = PMD_SIZE;
218 do {
219 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
220 address += PAGE_SIZE;
221 pte++;
222 } while (address < end);
225 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
226 unsigned long address, unsigned long size,
227 unsigned long entry, unsigned long page)
229 pmd_t * pmd;
230 unsigned long offset, end;
232 if (pgd_none(*dir))
233 return;
234 if (pgd_bad(*dir)) {
235 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
236 pgd_clear(dir);
237 return;
239 pmd = pmd_offset(dir, address);
240 offset = address & PGDIR_MASK;
241 address &= ~PGDIR_MASK;
242 end = address + size;
243 if (end > PGDIR_SIZE)
244 end = PGDIR_SIZE;
245 do {
246 unuse_pmd(vma, pmd, address, end - address, offset, entry,
247 page);
248 address = (address + PMD_SIZE) & PMD_MASK;
249 pmd++;
250 } while (address < end);
253 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
254 unsigned long entry, unsigned long page)
256 unsigned long start = vma->vm_start, end = vma->vm_end;
258 while (start < end) {
259 unuse_pgd(vma, pgdir, start, end - start, entry, page);
260 start = (start + PGDIR_SIZE) & PGDIR_MASK;
261 pgdir++;
265 static void unuse_process(struct mm_struct * mm, unsigned long entry,
266 unsigned long page)
268 struct vm_area_struct* vma;
271 * Go through process' page directory.
273 if (!mm || mm == &init_mm)
274 return;
275 for (vma = mm->mmap; vma; vma = vma->vm_next) {
276 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
277 unuse_vma(vma, pgd, entry, page);
279 return;
283 * We completely avoid races by reading each swap page in advance,
284 * and then search for the process using it. All the necessary
285 * page table adjustments can then be made atomically.
287 static int try_to_unuse(unsigned int type)
289 struct swap_info_struct * si = &swap_info[type];
290 struct task_struct *p;
291 struct page *page_map;
292 unsigned long entry, page;
293 int i;
295 while (1) {
297 * Find a swap page in use and read it in.
299 for (i = 1; i < si->max ; i++) {
300 if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
301 goto found_entry;
304 break;
306 found_entry:
307 entry = SWP_ENTRY(type, i);
309 /* Get a page for the entry, using the existing swap
310 cache page if there is one. Otherwise, get a clean
311 page and read the swap into it. */
312 page_map = read_swap_cache(entry);
313 if (!page_map) {
315 * Continue searching if the entry became unused.
317 if (si->swap_map[i] == 0)
318 continue;
319 return -ENOMEM;
321 page = page_address(page_map);
322 read_lock(&tasklist_lock);
323 for_each_task(p)
324 unuse_process(p->mm, entry, page);
325 read_unlock(&tasklist_lock);
326 shm_unuse(entry, page);
327 /* Now get rid of the extra reference to the temporary
328 page we've been using. */
329 if (PageSwapCache(page_map))
330 delete_from_swap_cache(page_map);
331 __free_page(page_map);
333 * Check for and clear any overflowed swap map counts.
335 if (si->swap_map[i] != 0) {
336 if (si->swap_map[i] != SWAP_MAP_MAX)
337 printk(KERN_ERR
338 "try_to_unuse: entry %08lx count=%d\n",
339 entry, si->swap_map[i]);
340 si->swap_map[i] = 0;
341 nr_swap_pages++;
344 return 0;
347 asmlinkage int sys_swapoff(const char * specialfile)
349 struct swap_info_struct * p = NULL;
350 struct dentry * dentry;
351 struct file filp;
352 int i, type, prev;
353 int err = -EPERM;
355 lock_kernel();
356 if (!capable(CAP_SYS_ADMIN))
357 goto out;
359 dentry = namei(specialfile);
360 err = PTR_ERR(dentry);
361 if (IS_ERR(dentry))
362 goto out;
364 prev = -1;
365 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
366 p = swap_info + type;
367 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
368 if (p->swap_file) {
369 if (p->swap_file == dentry)
370 break;
371 } else {
372 if (S_ISBLK(dentry->d_inode->i_mode)
373 && (p->swap_device == dentry->d_inode->i_rdev))
374 break;
377 prev = type;
379 err = -EINVAL;
380 if (type < 0)
381 goto out_dput;
383 if (prev < 0) {
384 swap_list.head = p->next;
385 } else {
386 swap_info[prev].next = p->next;
388 if (type == swap_list.next) {
389 /* just pick something that's safe... */
390 swap_list.next = swap_list.head;
392 p->flags = SWP_USED;
393 err = try_to_unuse(type);
394 if (err) {
395 /* re-insert swap space back into swap_list */
396 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
397 if (p->prio >= swap_info[i].prio)
398 break;
399 p->next = i;
400 if (prev < 0)
401 swap_list.head = swap_list.next = p - swap_info;
402 else
403 swap_info[prev].next = p - swap_info;
404 p->flags = SWP_WRITEOK;
405 goto out_dput;
407 if(p->swap_device){
408 memset(&filp, 0, sizeof(filp));
409 filp.f_dentry = dentry;
410 filp.f_mode = 3; /* read write */
411 /* open it again to get fops */
412 if( !blkdev_open(dentry->d_inode, &filp) &&
413 filp.f_op && filp.f_op->release){
414 filp.f_op->release(dentry->d_inode,&filp);
415 filp.f_op->release(dentry->d_inode,&filp);
418 dput(dentry);
420 dentry = p->swap_file;
421 p->swap_file = NULL;
422 nr_swap_pages -= p->pages;
423 p->swap_device = 0;
424 vfree(p->swap_map);
425 p->swap_map = NULL;
426 vfree(p->swap_lockmap);
427 p->swap_lockmap = NULL;
428 p->flags = 0;
429 err = 0;
431 out_dput:
432 dput(dentry);
433 out:
434 unlock_kernel();
435 return err;
438 int get_swaparea_info(char *buf)
440 char * page = (char *) __get_free_page(GFP_KERNEL);
441 struct swap_info_struct *ptr = swap_info;
442 int i, j, len = 0, usedswap;
444 if (!page)
445 return -ENOMEM;
447 len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
448 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
449 if (ptr->flags & SWP_USED) {
450 char * path = d_path(ptr->swap_file, page, PAGE_SIZE);
452 len += sprintf(buf + len, "%-31s ", path);
454 if (!ptr->swap_device)
455 len += sprintf(buf + len, "file\t\t");
456 else
457 len += sprintf(buf + len, "partition\t");
459 usedswap = 0;
460 for (j = 0; j < ptr->max; ++j)
461 switch (ptr->swap_map[j]) {
462 case SWAP_MAP_BAD:
463 case 0:
464 continue;
465 default:
466 usedswap++;
468 len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
469 usedswap << (PAGE_SHIFT - 10), ptr->prio);
472 free_page((unsigned long) page);
473 return len;
477 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
479 * The swapon system call
481 asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
483 struct swap_info_struct * p;
484 struct dentry * swap_dentry;
485 unsigned int type;
486 int i, j, prev;
487 int error = -EPERM;
488 struct file filp;
489 static int least_priority = 0;
490 union swap_header *swap_header = 0;
491 int swap_header_version;
492 int lock_map_size = PAGE_SIZE;
493 int nr_good_pages = 0;
494 unsigned long tmp_lock_map = 0;
496 lock_kernel();
497 if (!capable(CAP_SYS_ADMIN))
498 goto out;
499 memset(&filp, 0, sizeof(filp));
500 p = swap_info;
501 for (type = 0 ; type < nr_swapfiles ; type++,p++)
502 if (!(p->flags & SWP_USED))
503 break;
504 if (type >= MAX_SWAPFILES)
505 goto out;
506 if (type >= nr_swapfiles)
507 nr_swapfiles = type+1;
508 p->flags = SWP_USED;
509 p->swap_file = NULL;
510 p->swap_device = 0;
511 p->swap_map = NULL;
512 p->swap_lockmap = NULL;
513 p->lowest_bit = 0;
514 p->highest_bit = 0;
515 p->cluster_nr = 0;
516 p->max = 1;
517 p->next = -1;
518 if (swap_flags & SWAP_FLAG_PREFER) {
519 p->prio =
520 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
521 } else {
522 p->prio = --least_priority;
524 swap_dentry = namei(specialfile);
525 error = PTR_ERR(swap_dentry);
526 if (IS_ERR(swap_dentry))
527 goto bad_swap_2;
529 p->swap_file = swap_dentry;
530 error = -EINVAL;
532 if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
533 p->swap_device = swap_dentry->d_inode->i_rdev;
534 set_blocksize(p->swap_device, PAGE_SIZE);
536 filp.f_dentry = swap_dentry;
537 filp.f_mode = 3; /* read write */
538 error = blkdev_open(swap_dentry->d_inode, &filp);
539 if (error)
540 goto bad_swap_2;
541 set_blocksize(p->swap_device, PAGE_SIZE);
542 error = -ENODEV;
543 if (!p->swap_device ||
544 (blk_size[MAJOR(p->swap_device)] &&
545 !blk_size[MAJOR(p->swap_device)][MINOR(p->swap_device)]))
546 goto bad_swap;
547 error = -EBUSY;
548 for (i = 0 ; i < nr_swapfiles ; i++) {
549 if (i == type)
550 continue;
551 if (p->swap_device == swap_info[i].swap_device)
552 goto bad_swap;
554 } else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
555 error = -EBUSY;
556 for (i = 0 ; i < nr_swapfiles ; i++) {
557 if (i == type)
558 continue;
559 if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
560 goto bad_swap;
562 } else
563 goto bad_swap;
565 swap_header = (void *) __get_free_page(GFP_USER);
566 if (!swap_header) {
567 printk("Unable to start swapping: out of memory :-)\n");
568 error = -ENOMEM;
569 goto bad_swap;
572 p->swap_lockmap = (char *) &tmp_lock_map;
573 rw_swap_page_nocache(READ, SWP_ENTRY(type,0), (char *) swap_header);
574 p->swap_lockmap = NULL;
576 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
577 swap_header_version = 1;
578 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
579 swap_header_version = 2;
580 else {
581 printk("Unable to find swap-space signature\n");
582 error = -EINVAL;
583 goto bad_swap;
586 switch (swap_header_version) {
587 case 1:
588 memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
589 j = 0;
590 p->lowest_bit = 0;
591 p->highest_bit = 0;
592 for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
593 if (test_bit(i,(char *) swap_header)) {
594 if (!p->lowest_bit)
595 p->lowest_bit = i;
596 p->highest_bit = i;
597 p->max = i+1;
598 j++;
601 nr_good_pages = j;
602 p->swap_map = vmalloc(p->max * sizeof(short));
603 if (!p->swap_map) {
604 error = -ENOMEM;
605 goto bad_swap;
607 for (i = 1 ; i < p->max ; i++) {
608 if (test_bit(i,(char *) swap_header))
609 p->swap_map[i] = 0;
610 else
611 p->swap_map[i] = SWAP_MAP_BAD;
613 break;
615 case 2:
616 /* Check the swap header's sub-version and the size of
617 the swap file and bad block lists */
618 if (swap_header->info.version != 1) {
619 printk(KERN_WARNING
620 "Unable to handle swap header version %d\n",
621 swap_header->info.version);
622 error = -EINVAL;
623 goto bad_swap;
626 p->lowest_bit = 1;
627 p->highest_bit = swap_header->info.last_page - 1;
628 p->max = swap_header->info.last_page;
630 error = -EINVAL;
631 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
632 goto bad_swap;
633 if (p->max >= SWP_OFFSET(SWP_ENTRY(0,~0UL)))
634 goto bad_swap;
636 /* OK, set up the swap map and apply the bad block list */
637 if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
638 error = -ENOMEM;
639 goto bad_swap;
642 error = 0;
643 memset(p->swap_map, 0, p->max * sizeof(short));
644 for (i=0; i<swap_header->info.nr_badpages; i++) {
645 int page = swap_header->info.badpages[i];
646 if (page <= 0 || page >= swap_header->info.last_page)
647 error = -EINVAL;
648 else
649 p->swap_map[page] = SWAP_MAP_BAD;
651 nr_good_pages = swap_header->info.last_page - i;
652 lock_map_size = (p->max + 7) / 8;
653 if (error)
654 goto bad_swap;
657 if (!nr_good_pages) {
658 printk(KERN_WARNING "Empty swap-file\n");
659 error = -EINVAL;
660 goto bad_swap;
662 p->swap_map[0] = SWAP_MAP_BAD;
663 if (!(p->swap_lockmap = vmalloc (lock_map_size))) {
664 error = -ENOMEM;
665 goto bad_swap;
667 memset(p->swap_lockmap,0,lock_map_size);
668 p->flags = SWP_WRITEOK;
669 p->pages = nr_good_pages;
670 nr_swap_pages += nr_good_pages;
671 printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
672 nr_good_pages<<(PAGE_SHIFT-10), p->prio);
674 /* insert swap space into swap_list: */
675 prev = -1;
676 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
677 if (p->prio >= swap_info[i].prio) {
678 break;
680 prev = i;
682 p->next = i;
683 if (prev < 0) {
684 swap_list.head = swap_list.next = p - swap_info;
685 } else {
686 swap_info[prev].next = p - swap_info;
688 error = 0;
689 goto out;
690 bad_swap:
691 if(filp.f_op && filp.f_op->release)
692 filp.f_op->release(filp.f_dentry->d_inode,&filp);
693 bad_swap_2:
694 if (p->swap_lockmap)
695 vfree(p->swap_lockmap);
696 if (p->swap_map)
697 vfree(p->swap_map);
698 dput(p->swap_file);
699 p->swap_device = 0;
700 p->swap_file = NULL;
701 p->swap_map = NULL;
702 p->swap_lockmap = NULL;
703 p->flags = 0;
704 if (!(swap_flags & SWAP_FLAG_PREFER))
705 ++least_priority;
706 out:
707 if (swap_header)
708 free_page((long) swap_header);
709 unlock_kernel();
710 return error;
713 void si_swapinfo(struct sysinfo *val)
715 unsigned int i, j;
717 val->freeswap = val->totalswap = 0;
718 for (i = 0; i < nr_swapfiles; i++) {
719 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
720 continue;
721 for (j = 0; j < swap_info[i].max; ++j)
722 switch (swap_info[i].swap_map[j]) {
723 case SWAP_MAP_BAD:
724 continue;
725 case 0:
726 ++val->freeswap;
727 default:
728 ++val->totalswap;
731 val->freeswap <<= PAGE_SHIFT;
732 val->totalswap <<= PAGE_SHIFT;
733 return;