Import 2.3.13pre6
[davej-history.git] / mm / swapfile.c
blob5a7a148ea8cc48e5bb79dcb0e5d376c27e1828c0
1 /*
2 * linux/mm/swapfile.c
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
6 */
8 #include <linux/malloc.h>
9 #include <linux/smp_lock.h>
10 #include <linux/kernel_stat.h>
11 #include <linux/swap.h>
12 #include <linux/swapctl.h>
13 #include <linux/blkdev.h> /* for blk_size */
14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h>
16 #include <linux/shm.h>
18 #include <asm/pgtable.h>
/* Count of swap_info[] slots handed out so far; sys_swapon() raises it to
 * type+1 when it claims a slot at or past the current end. */
20 unsigned int nr_swapfiles = 0;
/* List bookkeeping for the active swap areas.  Both members start at -1;
 * get_swap_page() treats a negative swap_list.next as "no swap area". */
22 struct swap_list_t swap_list = {-1, -1};
/* One descriptor per swap area, indexed by swap type. */
24 struct swap_info_struct swap_info[MAX_SWAPFILES];
/* cluster_nr is reloaded with this value each time scan_swap_map() falls
 * back to the first-free scan and starts a new cluster. */
26 #define SWAPFILE_CLUSTER 256
28 static inline int scan_swap_map(struct swap_info_struct *si)
30 unsigned long offset;
31 /*
32 * We try to cluster swap pages by allocating them
33 * sequentially in swap. Once we've allocated
34 * SWAPFILE_CLUSTER pages this way, however, we resort to
35 * first-free allocation, starting a new cluster. This
36 * prevents us from scattering swap pages all over the entire
37 * swap partition, so that we reduce overall disk seek times
38 * between swap pages. -- sct */
39 if (si->cluster_nr) {
40 while (si->cluster_next <= si->highest_bit) {
41 offset = si->cluster_next++;
42 if (si->swap_map[offset])
43 continue;
44 si->cluster_nr--;
45 goto got_page;
48 si->cluster_nr = SWAPFILE_CLUSTER;
49 for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
50 if (si->swap_map[offset])
51 continue;
52 si->lowest_bit = offset;
53 got_page:
54 si->swap_map[offset] = 1;
55 nr_swap_pages--;
56 if (offset == si->highest_bit)
57 si->highest_bit--;
58 si->cluster_next = offset;
59 return offset;
61 return 0;
64 unsigned long get_swap_page(void)
66 struct swap_info_struct * p;
67 unsigned long offset, entry;
68 int type, wrapped = 0;
70 type = swap_list.next;
71 if (type < 0)
72 return 0;
73 if (nr_swap_pages == 0)
74 return 0;
76 while (1) {
77 p = &swap_info[type];
78 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
79 offset = scan_swap_map(p);
80 if (offset) {
81 entry = SWP_ENTRY(type,offset);
82 type = swap_info[type].next;
83 if (type < 0 ||
84 p->prio != swap_info[type].prio)
86 swap_list.next = swap_list.head;
88 else
90 swap_list.next = type;
92 return entry;
95 type = p->next;
96 if (!wrapped) {
97 if (type < 0 || p->prio != swap_info[type].prio) {
98 type = swap_list.head;
99 wrapped = 1;
101 } else if (type < 0) {
102 return 0; /* out of swap space */
108 void swap_free(unsigned long entry)
110 struct swap_info_struct * p;
111 unsigned long offset, type;
113 if (!entry)
114 goto out;
116 type = SWP_TYPE(entry);
117 if (type & SHM_SWP_TYPE)
118 goto out;
119 if (type >= nr_swapfiles)
120 goto bad_nofile;
121 p = & swap_info[type];
122 if (!(p->flags & SWP_USED))
123 goto bad_device;
124 if (p->prio > swap_info[swap_list.next].prio)
125 swap_list.next = swap_list.head;
126 offset = SWP_OFFSET(entry);
127 if (offset >= p->max)
128 goto bad_offset;
129 if (offset < p->lowest_bit)
130 p->lowest_bit = offset;
131 if (offset > p->highest_bit)
132 p->highest_bit = offset;
133 if (!p->swap_map[offset])
134 goto bad_free;
135 if (p->swap_map[offset] < SWAP_MAP_MAX) {
136 if (!--p->swap_map[offset])
137 nr_swap_pages++;
139 #ifdef DEBUG_SWAP
140 printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
141 entry, p->swap_map[offset]);
142 #endif
143 out:
144 return;
146 bad_nofile:
147 printk("swap_free: Trying to free nonexistent swap-page\n");
148 goto out;
149 bad_device:
150 printk("swap_free: Trying to free swap from unused swap-device\n");
151 goto out;
152 bad_offset:
153 printk("swap_free: offset exceeds max\n");
154 goto out;
155 bad_free:
156 printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
157 goto out;
161 * The swap entry has been read in advance, and we return 1 to indicate
162 * that the page has been used or is no longer needed.
164 * Always set the resulting pte to be nowrite (the same as COW pages
165 * after one process has exited). We don't know just how many PTEs will
166 * share this swap entry, so be cautious and let do_wp_page work out
167 * what to do if a write is requested later.
169 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
170 pte_t *dir, unsigned long entry, unsigned long page)
172 pte_t pte = *dir;
174 if (pte_none(pte))
175 return;
176 if (pte_present(pte)) {
177 /* If this entry is swap-cached, then page must already
178 hold the right address for any copies in physical
179 memory */
180 if (pte_page(pte) != page)
181 return;
182 /* We will be removing the swap cache in a moment, so... */
183 set_pte(dir, pte_mkdirty(pte));
184 return;
186 if (pte_val(pte) != entry)
187 return;
188 set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
189 swap_free(entry);
190 get_page(mem_map + MAP_NR(page));
191 ++vma->vm_mm->rss;
194 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
195 unsigned long address, unsigned long size, unsigned long offset,
196 unsigned long entry, unsigned long page)
198 pte_t * pte;
199 unsigned long end;
201 if (pmd_none(*dir))
202 return;
203 if (pmd_bad(*dir)) {
204 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
205 pmd_clear(dir);
206 return;
208 pte = pte_offset(dir, address);
209 offset += address & PMD_MASK;
210 address &= ~PMD_MASK;
211 end = address + size;
212 if (end > PMD_SIZE)
213 end = PMD_SIZE;
214 do {
215 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
216 address += PAGE_SIZE;
217 pte++;
218 } while (address < end);
221 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
222 unsigned long address, unsigned long size,
223 unsigned long entry, unsigned long page)
225 pmd_t * pmd;
226 unsigned long offset, end;
228 if (pgd_none(*dir))
229 return;
230 if (pgd_bad(*dir)) {
231 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
232 pgd_clear(dir);
233 return;
235 pmd = pmd_offset(dir, address);
236 offset = address & PGDIR_MASK;
237 address &= ~PGDIR_MASK;
238 end = address + size;
239 if (end > PGDIR_SIZE)
240 end = PGDIR_SIZE;
241 do {
242 unuse_pmd(vma, pmd, address, end - address, offset, entry,
243 page);
244 address = (address + PMD_SIZE) & PMD_MASK;
245 pmd++;
246 } while (address < end);
249 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
250 unsigned long entry, unsigned long page)
252 unsigned long start = vma->vm_start, end = vma->vm_end;
254 while (start < end) {
255 unuse_pgd(vma, pgdir, start, end - start, entry, page);
256 start = (start + PGDIR_SIZE) & PGDIR_MASK;
257 pgdir++;
261 static void unuse_process(struct mm_struct * mm, unsigned long entry,
262 unsigned long page)
264 struct vm_area_struct* vma;
267 * Go through process' page directory.
269 if (!mm)
270 return;
271 for (vma = mm->mmap; vma; vma = vma->vm_next) {
272 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
273 unuse_vma(vma, pgd, entry, page);
275 return;
279 * We completely avoid races by reading each swap page in advance,
280 * and then search for the process using it. All the necessary
281 * page table adjustments can then be made atomically.
283 static int try_to_unuse(unsigned int type)
285 struct swap_info_struct * si = &swap_info[type];
286 struct task_struct *p;
287 struct page *page_map;
288 unsigned long entry, page;
289 int i;
291 while (1) {
293 * Find a swap page in use and read it in.
295 for (i = 1; i < si->max ; i++) {
296 if (si->swap_map[i] > 0 && si->swap_map[i] != SWAP_MAP_BAD) {
297 goto found_entry;
300 break;
302 found_entry:
303 entry = SWP_ENTRY(type, i);
305 /* Get a page for the entry, using the existing swap
306 cache page if there is one. Otherwise, get a clean
307 page and read the swap into it. */
308 page_map = read_swap_cache(entry);
309 if (!page_map) {
311 * Continue searching if the entry became unused.
313 if (si->swap_map[i] == 0)
314 continue;
315 return -ENOMEM;
317 page = page_address(page_map);
318 read_lock(&tasklist_lock);
319 for_each_task(p)
320 unuse_process(p->mm, entry, page);
321 read_unlock(&tasklist_lock);
322 shm_unuse(entry, page);
323 /* Now get rid of the extra reference to the temporary
324 page we've been using. */
325 if (PageSwapCache(page_map))
326 delete_from_swap_cache(page_map);
327 __free_page(page_map);
329 * Check for and clear any overflowed swap map counts.
331 if (si->swap_map[i] != 0) {
332 if (si->swap_map[i] != SWAP_MAP_MAX)
333 printk(KERN_ERR
334 "try_to_unuse: entry %08lx count=%d\n",
335 entry, si->swap_map[i]);
336 si->swap_map[i] = 0;
337 nr_swap_pages++;
340 return 0;
343 asmlinkage int sys_swapoff(const char * specialfile)
345 struct swap_info_struct * p = NULL;
346 struct dentry * dentry;
347 struct file filp;
348 int i, type, prev;
349 int err = -EPERM;
351 lock_kernel();
352 if (!capable(CAP_SYS_ADMIN))
353 goto out;
355 dentry = namei(specialfile);
356 err = PTR_ERR(dentry);
357 if (IS_ERR(dentry))
358 goto out;
360 prev = -1;
361 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
362 p = swap_info + type;
363 if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
364 if (p->swap_file) {
365 if (p->swap_file == dentry)
366 break;
367 } else {
368 if (S_ISBLK(dentry->d_inode->i_mode)
369 && (p->swap_device == dentry->d_inode->i_rdev))
370 break;
373 prev = type;
375 err = -EINVAL;
376 if (type < 0)
377 goto out_dput;
379 if (prev < 0) {
380 swap_list.head = p->next;
381 } else {
382 swap_info[prev].next = p->next;
384 if (type == swap_list.next) {
385 /* just pick something that's safe... */
386 swap_list.next = swap_list.head;
388 p->flags = SWP_USED;
389 err = try_to_unuse(type);
390 if (err) {
391 /* re-insert swap space back into swap_list */
392 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
393 if (p->prio >= swap_info[i].prio)
394 break;
395 p->next = i;
396 if (prev < 0)
397 swap_list.head = swap_list.next = p - swap_info;
398 else
399 swap_info[prev].next = p - swap_info;
400 p->flags = SWP_WRITEOK;
401 goto out_dput;
403 if(p->swap_device){
404 memset(&filp, 0, sizeof(filp));
405 filp.f_dentry = dentry;
406 filp.f_mode = 3; /* read write */
407 /* open it again to get fops */
408 if( !blkdev_open(dentry->d_inode, &filp) &&
409 filp.f_op && filp.f_op->release){
410 filp.f_op->release(dentry->d_inode,&filp);
411 filp.f_op->release(dentry->d_inode,&filp);
414 dput(dentry);
416 dentry = p->swap_file;
417 p->swap_file = NULL;
418 nr_swap_pages -= p->pages;
419 p->swap_device = 0;
420 vfree(p->swap_map);
421 p->swap_map = NULL;
422 p->flags = 0;
423 err = 0;
425 out_dput:
426 dput(dentry);
427 out:
428 unlock_kernel();
429 return err;
432 int get_swaparea_info(char *buf)
434 char * page = (char *) __get_free_page(GFP_KERNEL);
435 struct swap_info_struct *ptr = swap_info;
436 int i, j, len = 0, usedswap;
438 if (!page)
439 return -ENOMEM;
441 len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
442 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
443 if (ptr->flags & SWP_USED) {
444 char * path = d_path(ptr->swap_file, page, PAGE_SIZE);
446 len += sprintf(buf + len, "%-31s ", path);
448 if (!ptr->swap_device)
449 len += sprintf(buf + len, "file\t\t");
450 else
451 len += sprintf(buf + len, "partition\t");
453 usedswap = 0;
454 for (j = 0; j < ptr->max; ++j)
455 switch (ptr->swap_map[j]) {
456 case SWAP_MAP_BAD:
457 case 0:
458 continue;
459 default:
460 usedswap++;
462 len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10),
463 usedswap << (PAGE_SHIFT - 10), ptr->prio);
466 free_page((unsigned long) page);
467 return len;
470 int is_swap_partition(kdev_t dev) {
471 struct swap_info_struct *ptr = swap_info;
472 int i;
474 for (i = 0 ; i < nr_swapfiles ; i++, ptr++) {
475 if (ptr->flags & SWP_USED)
476 if (ptr->swap_device == dev)
477 return 1;
479 return 0;
483 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
485 * The swapon system call
487 asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
489 struct swap_info_struct * p;
490 struct dentry * swap_dentry;
491 unsigned int type;
492 int i, j, prev;
493 int error = -EPERM;
494 struct file filp;
495 static int least_priority = 0;
496 union swap_header *swap_header = 0;
497 int swap_header_version;
498 int lock_map_size = PAGE_SIZE;
499 int nr_good_pages = 0;
500 unsigned long maxpages;
501 int swapfilesize;
503 lock_kernel();
504 if (!capable(CAP_SYS_ADMIN))
505 goto out;
506 memset(&filp, 0, sizeof(filp));
507 p = swap_info;
508 for (type = 0 ; type < nr_swapfiles ; type++,p++)
509 if (!(p->flags & SWP_USED))
510 break;
511 if (type >= MAX_SWAPFILES)
512 goto out;
513 if (type >= nr_swapfiles)
514 nr_swapfiles = type+1;
515 p->flags = SWP_USED;
516 p->swap_file = NULL;
517 p->swap_device = 0;
518 p->swap_map = NULL;
519 p->lowest_bit = 0;
520 p->highest_bit = 0;
521 p->cluster_nr = 0;
522 p->max = 1;
523 p->next = -1;
524 if (swap_flags & SWAP_FLAG_PREFER) {
525 p->prio =
526 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
527 } else {
528 p->prio = --least_priority;
530 swap_dentry = namei(specialfile);
531 error = PTR_ERR(swap_dentry);
532 if (IS_ERR(swap_dentry))
533 goto bad_swap_2;
535 p->swap_file = swap_dentry;
536 error = -EINVAL;
538 if (S_ISBLK(swap_dentry->d_inode->i_mode)) {
539 kdev_t dev = swap_dentry->d_inode->i_rdev;
541 p->swap_device = dev;
542 set_blocksize(dev, PAGE_SIZE);
544 filp.f_dentry = swap_dentry;
545 filp.f_mode = 3; /* read write */
546 error = blkdev_open(swap_dentry->d_inode, &filp);
547 if (error)
548 goto bad_swap_2;
549 set_blocksize(dev, PAGE_SIZE);
550 error = -ENODEV;
551 if (!dev || (blk_size[MAJOR(dev)] &&
552 !blk_size[MAJOR(dev)][MINOR(dev)]))
553 goto bad_swap;
554 error = -EBUSY;
555 for (i = 0 ; i < nr_swapfiles ; i++) {
556 if (i == type)
557 continue;
558 if (dev == swap_info[i].swap_device)
559 goto bad_swap;
561 swapfilesize = 0;
562 if (blk_size[MAJOR(dev)])
563 swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
564 / (PAGE_SIZE / 1024);
565 } else if (S_ISREG(swap_dentry->d_inode->i_mode)) {
566 error = -EBUSY;
567 for (i = 0 ; i < nr_swapfiles ; i++) {
568 if (i == type || !swap_info[i].swap_file)
569 continue;
570 if (swap_dentry->d_inode == swap_info[i].swap_file->d_inode)
571 goto bad_swap;
573 swapfilesize = swap_dentry->d_inode->i_size / PAGE_SIZE;
574 } else
575 goto bad_swap;
577 swap_header = (void *) __get_free_page(GFP_USER);
578 if (!swap_header) {
579 printk("Unable to start swapping: out of memory :-)\n");
580 error = -ENOMEM;
581 goto bad_swap;
584 lock_page(mem_map + MAP_NR(swap_header));
585 rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header, 1);
587 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
588 swap_header_version = 1;
589 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
590 swap_header_version = 2;
591 else {
592 printk("Unable to find swap-space signature\n");
593 error = -EINVAL;
594 goto bad_swap;
597 switch (swap_header_version) {
598 case 1:
599 memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
600 j = 0;
601 p->lowest_bit = 0;
602 p->highest_bit = 0;
603 for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
604 if (test_bit(i,(char *) swap_header)) {
605 if (!p->lowest_bit)
606 p->lowest_bit = i;
607 p->highest_bit = i;
608 p->max = i+1;
609 j++;
612 nr_good_pages = j;
613 p->swap_map = vmalloc(p->max * sizeof(short));
614 if (!p->swap_map) {
615 error = -ENOMEM;
616 goto bad_swap;
618 for (i = 1 ; i < p->max ; i++) {
619 if (test_bit(i,(char *) swap_header))
620 p->swap_map[i] = 0;
621 else
622 p->swap_map[i] = SWAP_MAP_BAD;
624 break;
626 case 2:
627 /* Check the swap header's sub-version and the size of
628 the swap file and bad block lists */
629 if (swap_header->info.version != 1) {
630 printk(KERN_WARNING
631 "Unable to handle swap header version %d\n",
632 swap_header->info.version);
633 error = -EINVAL;
634 goto bad_swap;
637 p->lowest_bit = 1;
638 p->highest_bit = swap_header->info.last_page - 1;
639 p->max = swap_header->info.last_page;
641 maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL));
642 if (p->max >= maxpages)
643 p->max = maxpages-1;
645 error = -EINVAL;
646 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
647 goto bad_swap;
649 /* OK, set up the swap map and apply the bad block list */
650 if (!(p->swap_map = vmalloc (p->max * sizeof(short)))) {
651 error = -ENOMEM;
652 goto bad_swap;
655 error = 0;
656 memset(p->swap_map, 0, p->max * sizeof(short));
657 for (i=0; i<swap_header->info.nr_badpages; i++) {
658 int page = swap_header->info.badpages[i];
659 if (page <= 0 || page >= swap_header->info.last_page)
660 error = -EINVAL;
661 else
662 p->swap_map[page] = SWAP_MAP_BAD;
664 nr_good_pages = swap_header->info.last_page -
665 swap_header->info.nr_badpages -
666 1 /* header page */;
667 lock_map_size = (p->max + 7) / 8;
668 if (error)
669 goto bad_swap;
672 if (swapfilesize && p->max > swapfilesize) {
673 printk(KERN_WARNING
674 "Swap area shorter than signature indicates\n");
675 error = -EINVAL;
676 goto bad_swap;
678 if (!nr_good_pages) {
679 printk(KERN_WARNING "Empty swap-file\n");
680 error = -EINVAL;
681 goto bad_swap;
683 p->swap_map[0] = SWAP_MAP_BAD;
684 p->flags = SWP_WRITEOK;
685 p->pages = nr_good_pages;
686 nr_swap_pages += nr_good_pages;
687 printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
688 nr_good_pages<<(PAGE_SHIFT-10), p->prio);
690 /* insert swap space into swap_list: */
691 prev = -1;
692 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
693 if (p->prio >= swap_info[i].prio) {
694 break;
696 prev = i;
698 p->next = i;
699 if (prev < 0) {
700 swap_list.head = swap_list.next = p - swap_info;
701 } else {
702 swap_info[prev].next = p - swap_info;
704 error = 0;
705 goto out;
706 bad_swap:
707 if(filp.f_op && filp.f_op->release)
708 filp.f_op->release(filp.f_dentry->d_inode,&filp);
709 bad_swap_2:
710 if (p->swap_map)
711 vfree(p->swap_map);
712 dput(p->swap_file);
713 p->swap_device = 0;
714 p->swap_file = NULL;
715 p->swap_map = NULL;
716 p->flags = 0;
717 if (!(swap_flags & SWAP_FLAG_PREFER))
718 ++least_priority;
719 out:
720 if (swap_header)
721 free_page((long) swap_header);
722 unlock_kernel();
723 return error;
726 void si_swapinfo(struct sysinfo *val)
728 unsigned int i, j;
730 val->freeswap = val->totalswap = 0;
731 for (i = 0; i < nr_swapfiles; i++) {
732 if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
733 continue;
734 for (j = 0; j < swap_info[i].max; ++j)
735 switch (swap_info[i].swap_map[j]) {
736 case SWAP_MAP_BAD:
737 continue;
738 case 0:
739 ++val->freeswap;
740 default:
741 ++val->totalswap;
744 val->freeswap <<= PAGE_SHIFT;
745 val->totalswap <<= PAGE_SHIFT;
746 return;