/*
 * Import 2.3.8pre3
 * [davej-history.git] / mm / swapfile.c
 * blob a4a523ef25cb0ce182d067f23a6f40324c262dbe
 */
1 /*
2 * linux/mm/swapfile.c
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */
8 #include <linux/config.h>
9 #include <linux/malloc.h>
10 #include <linux/smp_lock.h>
11 #include <linux/kernel_stat.h>
12 #include <linux/swap.h>
13 #include <linux/swapctl.h>
14 #include <linux/blkdev.h> /* for blk_size */
15 #include <linux/vmalloc.h>
16 #include <linux/pagemap.h>
17 #include <linux/shm.h>
19 #include <asm/pgtable.h>
21 unsigned int nr_swapfiles = 0;
23 struct swap_list_t swap_list = {-1, -1};
25 struct swap_info_struct swap_info[MAX_SWAPFILES];
27 #define SWAPFILE_CLUSTER 256
29 static inline int scan_swap_map(struct swap_info_struct *si)
31 unsigned long offset;
32 /*
33 * We try to cluster swap pages by allocating them
34 * sequentially in swap. Once we've allocated
35 * SWAPFILE_CLUSTER pages this way, however, we resort to
36 * first-free allocation, starting a new cluster. This
37 * prevents us from scattering swap pages all over the entire
38 * swap partition, so that we reduce overall disk seek times
39 * between swap pages. -- sct */
40 if (si->cluster_nr) {
41 while (si->cluster_next <= si->highest_bit) {
42 offset = si->cluster_next++;
43 if (si->swap_map[offset])
44 continue;
45 si->cluster_nr--;
46 goto got_page;
49 si->cluster_nr = SWAPFILE_CLUSTER;
50 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
51 if (si->swap_map[offset])
52 continue;
53 si->lowest_bit = offset;
54 got_page:
55 si->swap_map[offset] = 1;
56 nr_swap_pages--;
57 if (offset == si->highest_bit)
58 si->highest_bit--;
59 si->cluster_next = offset;
60 return offset;
62 return 0;
65 unsigned long get_swap_page(void)
67 struct swap_info_struct * p;
68 unsigned long offset, entry;
69 int type, wrapped = 0;
71 type = swap_list.next;
72 if (type < 0)
73 return 0;
74 if (nr_swap_pages == 0)
75 return 0;
77 while (1) {
78 p = &swap_info[type];
79 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
80 offset = scan_swap_map(p);
81 if (offset) {
82 entry = SWP_ENTRY(type,offset);
83 type = swap_info[type].next;
84 if (type < 0 ||
85 p->prio != swap_info[type].prio)
87 swap_list.next = swap_list.head;
89 else
91 swap_list.next = type;
93 return entry;
96 type = p->next;
97 if (!wrapped) {
98 if (type < 0 || p->prio != swap_info[type].prio) {
99 type = swap_list.head;
100 wrapped = 1;
102 } else if (type < 0) {
103 return 0; /* out of swap space */
109 void swap_free(unsigned long entry)
111 struct swap_info_struct * p;
112 unsigned long offset, type;
114 if (!entry)
115 goto out;
117 type = SWP_TYPE(entry);
118 if (type & SHM_SWP_TYPE)
119 goto out;
120 if (type >= nr_swapfiles)
121 goto bad_nofile;
122 p = & swap_info[type];
123 if (!(p->flags & SWP_USED))
124 goto bad_device;
125 if (p->prio > swap_info[swap_list.next].prio)
126 swap_list.next = swap_list.head;
127 offset = SWP_OFFSET(entry);
128 if (offset >= p->max)
129 goto bad_offset;
130 if (offset < p->lowest_bit)
131 p->lowest_bit = offset;
132 if (offset > p->highest_bit)
133 p->highest_bit = offset;
134 if (!p->swap_map[offset])
135 goto bad_free;
136 if (p->swap_map[offset] < SWAP_MAP_MAX) {
137 if (!--p->swap_map[offset])
138 nr_swap_pages++;
140 #ifdef DEBUG_SWAP
141 printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
142 entry, p->swap_map[offset]);
143 #endif
144 out:
145 return;
147 bad_nofile:
148 printk("swap_free: Trying to free nonexistent swap-page\n");
149 goto out;
150 bad_device:
151 printk("swap_free: Trying to free swap from unused swap-device\n");
152 goto out;
153 bad_offset:
154 printk("swap_free: offset exceeds max\n");
155 goto out;
156 bad_free:
157 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
158 goto out;
162 * The swap entry has been read in advance, and we return 1 to indicate
163 * that the page has been used or is no longer needed.
165 * Always set the resulting pte to be nowrite (the same as COW pages
166 * after one process has exited). We don't know just how many PTEs will
167 * share this swap entry, so be cautious and let do_wp_page work out
168 * what to do if a write is requested later.
170 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
171 pte_t *dir, unsigned long entry, unsigned long page)
173 pte_t pte = *dir;
175 if (pte_none(pte))
176 return;
177 if (pte_present(pte)) {
178 /* If this entry is swap-cached, then page must already
179 hold the right address for any copies in physical
180 memory */
181 if (pte_page(pte) != page)
182 return;
183 /* We will be removing the swap cache in a moment, so... */
184 set_pte(dir, pte_mkdirty(pte));
185 return;
187 if (pte_val(pte) != entry)
188 return;
189 set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
190 swap_free(entry);
191 get_page(mem_map + MAP_NR(page));
192 ++vma->vm_mm->rss;
195 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
196 unsigned long address, unsigned long size, unsigned long offset,
197 unsigned long entry, unsigned long page)
199 pte_t * pte;
200 unsigned long end;
202 if (pmd_none(*dir))
203 return;
204 if (pmd_bad(*dir)) {
205 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
206 pmd_clear(dir);
207 return;
209 pte = pte_offset(dir, address);
210 offset += address & PMD_MASK;
211 address &= ~PMD_MASK;
212 end = address + size;
213 if (end > PMD_SIZE)
214 end = PMD_SIZE;
215 do {
216 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
217 address += PAGE_SIZE;
218 pte++;
219 } while (address < end);
222 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
223 unsigned long address, unsigned long size,
224 unsigned long entry, unsigned long page)
226 pmd_t * pmd;
227 unsigned long offset, end;
229 if (pgd_none(*dir))
230 return;
231 if (pgd_bad(*dir)) {
232 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
233 pgd_clear(dir);
234 return;
236 pmd = pmd_offset(dir, address);
237 offset = address & PGDIR_MASK;
238 address &= ~PGDIR_MASK;
239 end = address + size;
240 if (end > PGDIR_SIZE)
241 end = PGDIR_SIZE;
242 do {
243 unuse_pmd(vma, pmd, address, end - address, offset, entry,
244 page);
245 address = (address + PMD_SIZE) & PMD_MASK;
246 pmd++;
247 } while (address < end);
250 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
251 unsigned long entry, unsigned long page)
253 unsigned long start = vma->vm_start, end = vma->vm_end;
255 while (start < end) {
256 unuse_pgd(vma, pgdir, start, end - start, entry, page);
257 start = (start + PGDIR_SIZE) & PGDIR_MASK;
258 pgdir++;
262 static void unuse_process(struct mm_struct * mm, unsigned long entry,
263 unsigned long page)
265 struct vm_area_struct* vma;
268 * Go through process' page directory.
270 if (!mm || mm == &init_mm)
271 return;
272 for (vma = mm->mmap; vma; vma = vma->vm_next) {
273 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
274 unuse_vma(vma, pgd, entry, page);
276 return;
280 * We completely avoid races by reading each swap page in advance,
281 * and then search for the process using it. All the necessary
282 * page table adjustments can then be made atomically.
284 static int try_to_unuse(unsigned int type)
286 struct swap_info_struct * si = &swap_info[type];
287 struct task_struct *p;
288 struct page *page_map;
289 unsigned long entry, page;
290 int i;
292 while (1) {
294 * Find a swap page in use and read it in.
296 for (i = 1; i < si->max ; i++) {
297 if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
298 goto found_entry;
301 break;
303 found_entry:
304 entry = SWP_ENTRY(type, i);
306 /* Get a page for the entry, using the existing swap
307 cache page if there is one. Otherwise, get a clean
308 page and read the swap into it. */
309 page_map = read_swap_cache(entry);
310 if (!page_map) {
312 * Continue searching if the entry became unused.
314 if (si->swap_map[i] == 0)
315 continue;
316 return -ENOMEM;
318 page = page_address(page_map);
319 read_lock(&tasklist_lock);
320 for_each_task(p)
321 unuse_process(p->mm, entry, page);
322 read_unlock(&tasklist_lock);
323 shm_unuse(entry, page);
324 /* Now get rid of the extra reference to the temporary
325 page we've been using. */
326 if (PageSwapCache(page_map))
327 delete_from_swap_cache(page_map);
328 __free_page(page_map);
330 * Check for and clear any overflowed swap map counts.
332 if (si->swap_map[i] != 0) {
333 if (si->swap_map[i] != SWAP_MAP_MAX)
334 printk(KERN_ERR
335 "try_to_unuse: entry %08lx count=%d\n",
336 entry, si->swap_map[i]);
337 si->swap_map[i] = 0;
338 nr_swap_pages++;
341 return 0;
344 asmlinkage int sys_swapoff(const char * specialfile)
346 struct swap_info_struct * p = NULL;
347 struct dentry * dentry;
348 struct file filp;
349 int i, type, prev;
350 int err = -EPERM;
352 lock_kernel();
353 if (!capable(CAP_SYS_ADMIN))
354 goto out;
356 dentry = namei(specialfile);
357 err = PTR_ERR(dentry);
358 if (IS_ERR(dentry))
359 goto out;
361 prev = -1;
362 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
363 p = swap_info + type;
364 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
365 if (p->swap_file) {
366 if (p->swap_file == dentry)
367 break;
368 } else {
369 if (S_ISBLK(dentry->d_inode->i_mode)
370 && (p->swap_device == dentry->d_inode->i_rdev))
371 break;
374 prev = type;
376 err = -EINVAL;
377 if (type < 0)
378 goto out_dput;
380 if (prev < 0) {
381 swap_list.head = p->next;
382 } else {
383 swap_info[prev].next = p->next;
385 if (type == swap_list.next) {
386 /* just pick something that's safe... */
387 swap_list.next = swap_list.head;
389 p->flags = SWP_USED;
390 err = try_to_unuse(type);
391 if (err) {
392 /* re-insert swap space back into swap_list */
393 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
394 if (p->prio >= swap_info[i].prio)
395 break;
396 p->next = i;
397 if (prev < 0)
398 swap_list.head = swap_list.next = p - swap_info;
399 else
400 swap_info[prev].next = p - swap_info;
401 p->flags = SWP_WRITEOK;
402 goto out_dput;
404 if(p->swap_device){
405 memset(&filp, 0, sizeof(filp));
406 filp.f_dentry = dentry;
407 filp.f_mode = 3; /* read write */
408 /* open it again to get fops */
409 if( !blkdev_open(dentry->d_inode, &filp) &&
410 filp.f_op && filp.f_op->release){
411 filp.f_op->release(dentry->d_inode,&filp);
412 filp.f_op->release(dentry->d_inode,&filp);
415 dput(dentry);
417 dentry = p->swap_file;
418 p->swap_file = NULL;
419 nr_swap_pages -= p->pages;
420 p->swap_device = 0;
421 vfree(p->swap_map);
422 p->swap_map = NULL;
423 p->flags = 0;
424 err = 0;
426 out_dput:
427 dput(dentry);
428 out:
429 unlock_kernel();
430 return err;
433 int get_swaparea_info(char *buf)
435 char * page = (char *) __get_free_page(GFP_KERNEL);
436 struct swap_info_struct *ptr = swap_info;
437 int i, j, len = 0, usedswap;
439 if (!page)
440 return -ENOMEM;
442 len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
443 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
444 if (ptr->flags & SWP_USED) {
445 char * path = d_path(ptr->swap_file, page, PAGE_SIZE);
447 len += sprintf(buf + len, "%-31s ", path);
449 if (!ptr->swap_device)
450 len += sprintf(buf + len, "file\t\t");
451 else
452 len += sprintf(buf + len, "partition\t");
454 usedswap = 0;
455 for (j = 0; j < ptr->max; ++j)
456 switch (ptr->swap_map[j]) {
457 case SWAP_MAP_BAD:
458 case 0:
459 continue;
460 default:
461 usedswap++;
463 len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
464 usedswap << (PAGE_SHIFT - 10), ptr->prio);
467 free_page((unsigned long) page);
468 return len;
471 int is_swap_partition(kdev_t dev) {
472 struct swap_info_struct *ptr = swap_info;
473 int i;
475 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
476 if (ptr->flags & SWP_USED)
477 if (ptr->swap_device == dev)
478 return 1;
480 return 0;
484 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
486 * The swapon system call
488 asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
490 struct swap_info_struct * p;
491 struct dentry * swap_dentry;
492 unsigned int type;
493 int i, j, prev;
494 int error = -EPERM;
495 struct file filp;
496 static int least_priority = 0;
497 union swap_header *swap_header = 0;
498 int swap_header_version;
499 int lock_map_size = PAGE_SIZE;
500 int nr_good_pages = 0;
501 unsigned long maxpages;
502 int swapfilesize;
504 lock_kernel();
505 if (!capable(CAP_SYS_ADMIN))
506 goto out;
507 memset(&filp, 0, sizeof(filp));
508 p = swap_info;
509 for (type = 0 ; type < nr_swapfiles ; type++,p++)
510 if (!(p->flags & SWP_USED))
511 break;
512 if (type >= MAX_SWAPFILES)
513 goto out;
514 if (type >= nr_swapfiles)
515 nr_swapfiles = type+1;
516 p->flags = SWP_USED;
517 p->swap_file = NULL;
518 p->swap_device = 0;
519 p->swap_map = NULL;
520 p->lowest_bit = 0;
521 p->highest_bit = 0;
522 p->cluster_nr = 0;
523 p->max = 1;
524 p->next = -1;
525 if (swap_flags & SWAP_FLAG_PREFER) {
526 p->prio =
527 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
528 } else {
529 p->prio = --least_priority;
531 swap_dentry = namei(specialfile);
532 error = PTR_ERR(swap_dentry);
533 if (IS_ERR(swap_dentry))
534 goto bad_swap_2;
536 p->swap_file = swap_dentry;
537 error = -EINVAL;
539 if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
540 kdev_t dev = swap_dentry->d_inode->i_rdev;
542 p->swap_device = dev;
543 set_blocksize(dev, PAGE_SIZE);
545 filp.f_dentry = swap_dentry;
546 filp.f_mode = 3; /* read write */
547 error = blkdev_open(swap_dentry->d_inode, &filp);
548 if (error)
549 goto bad_swap_2;
550 set_blocksize(dev, PAGE_SIZE);
551 error = -ENODEV;
552 if (!dev || (blk_size[MAJOR(dev)] &&
553 !blk_size[MAJOR(dev)][MINOR(dev)]))
554 goto bad_swap;
555 error = -EBUSY;
556 for (i = 0 ; i < nr_swapfiles ; i++) {
557 if (i == type)
558 continue;
559 if (dev == swap_info[i].swap_device)
560 goto bad_swap;
562 swapfilesize = 0;
563 if (blk_size[MAJOR(dev)])
564 swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
565 / (PAGE_SIZE / 1024);
566 } else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
567 error = -EBUSY;
568 for (i = 0 ; i < nr_swapfiles ; i++) {
569 if (i == type || !swap_info[i].swap_file)
570 continue;
571 if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
572 goto bad_swap;
574 swapfilesize = swap_dentry->d_inode->i_size / PAGE_SIZE;
575 } else
576 goto bad_swap;
578 swap_header = (void *) __get_free_page(GFP_USER);
579 if (!swap_header) {
580 printk("Unable to start swapping: out of memory :-)\n");
581 error = -ENOMEM;
582 goto bad_swap;
585 lock_page(mem_map + MAP_NR(swap_header));
586 rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header, 1);
588 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
589 swap_header_version = 1;
590 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
591 swap_header_version = 2;
592 else {
593 printk("Unable to find swap-space signature\n");
594 error = -EINVAL;
595 goto bad_swap;
598 switch (swap_header_version) {
599 case 1:
600 memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
601 j = 0;
602 p->lowest_bit = 0;
603 p->highest_bit = 0;
604 for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
605 if (test_bit(i,(char *) swap_header)) {
606 if (!p->lowest_bit)
607 p->lowest_bit = i;
608 p->highest_bit = i;
609 p->max = i+1;
610 j++;
613 nr_good_pages = j;
614 p->swap_map = vmalloc(p->max * sizeof(short));
615 if (!p->swap_map) {
616 error = -ENOMEM;
617 goto bad_swap;
619 for (i = 1 ; i < p->max ; i++) {
620 if (test_bit(i,(char *) swap_header))
621 p->swap_map[i] = 0;
622 else
623 p->swap_map[i] = SWAP_MAP_BAD;
625 break;
627 case 2:
628 /* Check the swap header's sub-version and the size of
629 the swap file and bad block lists */
630 if (swap_header->info.version != 1) {
631 printk(KERN_WARNING
632 "Unable to handle swap header version %d\n",
633 swap_header->info.version);
634 error = -EINVAL;
635 goto bad_swap;
638 p->lowest_bit = 1;
639 p->highest_bit = swap_header->info.last_page - 1;
640 p->max = swap_header->info.last_page;
642 maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL));
643 if (p->max >= maxpages)
644 p->max = maxpages-1;
646 error = -EINVAL;
647 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
648 goto bad_swap;
650 /* OK, set up the swap map and apply the bad block list */
651 if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
652 error = -ENOMEM;
653 goto bad_swap;
656 error = 0;
657 memset(p->swap_map, 0, p->max * sizeof(short));
658 for (i=0; i<swap_header->info.nr_badpages; i++) {
659 int page = swap_header->info.badpages[i];
660 if (page <= 0 || page >= swap_header->info.last_page)
661 error = -EINVAL;
662 else
663 p->swap_map[page] = SWAP_MAP_BAD;
665 nr_good_pages = swap_header->info.last_page - i;
666 lock_map_size = (p->max + 7) / 8;
667 if (error)
668 goto bad_swap;
671 if (swapfilesize && p->max > swapfilesize) {
672 printk(KERN_WARNING
673 "Swap area shorter than signature indicates\n");
674 error = -EINVAL;
675 goto bad_swap;
677 if (!nr_good_pages) {
678 printk(KERN_WARNING "Empty swap-file\n");
679 error = -EINVAL;
680 goto bad_swap;
682 p->swap_map[0] = SWAP_MAP_BAD;
683 p->flags = SWP_WRITEOK;
684 p->pages = nr_good_pages;
685 nr_swap_pages += nr_good_pages;
686 printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
687 nr_good_pages<<(PAGE_SHIFT-10), p->prio);
689 /* insert swap space into swap_list: */
690 prev = -1;
691 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
692 if (p->prio >= swap_info[i].prio) {
693 break;
695 prev = i;
697 p->next = i;
698 if (prev < 0) {
699 swap_list.head = swap_list.next = p - swap_info;
700 } else {
701 swap_info[prev].next = p - swap_info;
703 error = 0;
704 goto out;
705 bad_swap:
706 if(filp.f_op && filp.f_op->release)
707 filp.f_op->release(filp.f_dentry->d_inode,&filp);
708 bad_swap_2:
709 if (p->swap_map)
710 vfree(p->swap_map);
711 dput(p->swap_file);
712 p->swap_device = 0;
713 p->swap_file = NULL;
714 p->swap_map = NULL;
715 p->flags = 0;
716 if (!(swap_flags & SWAP_FLAG_PREFER))
717 ++least_priority;
718 out:
719 if (swap_header)
720 free_page((long) swap_header);
721 unlock_kernel();
722 return error;
725 void si_swapinfo(struct sysinfo *val)
727 unsigned int i, j;
729 val->freeswap = val->totalswap = 0;
730 for (i = 0; i < nr_swapfiles; i++) {
731 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
732 continue;
733 for (j = 0; j < swap_info[i].max; ++j)
734 switch (swap_info[i].swap_map[j]) {
735 case SWAP_MAP_BAD:
736 continue;
737 case 0:
738 ++val->freeswap;
739 default:
740 ++val->totalswap;
743 val->freeswap <<= PAGE_SHIFT;
744 val->totalswap <<= PAGE_SHIFT;
745 return;