4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
10 #include <linux/smp_lock.h>
11 #include <linux/sched.h>
12 #include <linux/head.h>
13 #include <linux/kernel.h>
14 #include <linux/kernel_stat.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/stat.h>
18 #include <linux/swap.h>
20 #include <linux/swapctl.h>
21 #include <linux/malloc.h>
22 #include <linux/blkdev.h> /* for blk_size */
23 #include <linux/vmalloc.h>
24 #include <linux/pagemap.h>
25 #include <linux/shm.h>
27 #include <asm/bitops.h>
28 #include <asm/pgtable.h>
/*
 * Global swap bookkeeping: count of configured swap areas, the
 * priority-ordered list heads, and the per-device info table.
 * NOTE(review): the stray leading numbers throughout this file are
 * original line numbers from a garbled extraction; many original
 * lines are missing.  Code is preserved byte-identically.
 */
30 unsigned int nr_swapfiles
= 0;
/* {head, next} both start at -1: no swap areas registered yet. */
32 struct swap_list_t swap_list
= {-1, -1};
34 struct swap_info_struct swap_info
[MAX_SWAPFILES
];
/*
 * scan_swap_map(): pick one free page slot on swap device *si.
 * Visible strategy: continue the current allocation cluster via
 * si->cluster_next, then fall back to a first-free scan from
 * si->lowest_bit.  NOTE(review): this fragment is missing interior
 * lines (returns, braces, skip paths) -- left byte-identical.
 */
37 static inline int scan_swap_map(struct swap_info_struct
*si
)
41 * We try to cluster swap pages by allocating them
42 * sequentially in swap. Once we've allocated
43 * SWAP_CLUSTER_MAX pages this way, however, we resort to
44 * first-free allocation, starting a new cluster. This
45 * prevents us from scattering swap pages all over the entire
46 * swap partition, so that we reduce overall disk seek times
47 * between swap pages. -- sct */
/* Continue the current cluster while it stays within highest_bit. */
49 while (si
->cluster_next
<= si
->highest_bit
) {
50 offset
= si
->cluster_next
++;
/* Slot already in use, or locked for I/O -- skip it (skip path lost). */
51 if (si
->swap_map
[offset
])
53 if (test_bit(offset
, si
->swap_lockmap
))
/* Cluster exhausted: start a fresh cluster, then first-free scan. */
59 si
->cluster_nr
= SWAP_CLUSTER_MAX
;
60 for (offset
= si
->lowest_bit
; offset
<= si
->highest_bit
; offset
++) {
61 if (si
->swap_map
[offset
])
63 if (test_bit(offset
, si
->swap_lockmap
))
/* Remember the new low-water mark for future scans. */
65 si
->lowest_bit
= offset
;
/* Claim the slot: reference count in the map becomes 1. */
67 si
->swap_map
[offset
] = 1;
69 if (offset
== si
->highest_bit
)
71 si
->cluster_next
= offset
;
/*
 * get_swap_page(): allocate one swap entry from the highest-priority
 * device with free space, rotating among devices of equal priority
 * via swap_list.next.  Returns 0 when out of swap space.
 * NOTE(review): garbled extraction -- loop structure and several
 * statements are missing; code left byte-identical.
 */
77 unsigned long get_swap_page(void)
79 struct swap_info_struct
* p
;
80 unsigned long offset
, entry
;
81 int type
, wrapped
= 0;
/* Start the search at the device the rotation last stopped on. */
83 type
= swap_list
.next
;
86 if (nr_swap_pages
== 0)
/* Only allocate from devices that are fully writable. */
91 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
92 offset
= scan_swap_map(p
);
/* Encode (device, offset) into a single swap entry. */
94 entry
= SWP_ENTRY(type
,offset
);
95 type
= swap_info
[type
].next
;
/* Priority boundary crossed: restart rotation from the list head. */
97 p
->prio
!= swap_info
[type
].prio
)
99 swap_list
.next
= swap_list
.head
;
103 swap_list
.next
= type
;
/* End of list or new priority band: wrap back to the head once. */
110 if (type
< 0 || p
->prio
!= swap_info
[type
].prio
) {
111 type
= swap_list
.head
;
114 } else if (type
< 0) {
115 return 0; /* out of swap space */
/*
 * swap_free(): drop one reference to a swap entry.  Validates the
 * entry, updates the device's lowest_bit/highest_bit allocation
 * hints, and decrements the map count (unless pinned at
 * SWAP_MAP_MAX).  The printk()s at the end are the error/diagnostic
 * exits (the goto labels linking them were lost in extraction).
 */
121 void swap_free(unsigned long entry
)
123 struct swap_info_struct
* p
;
124 unsigned long offset
, type
;
129 type
= SWP_TYPE(entry
);
/* SysV shared-memory entries are not ours to free here. */
130 if (type
& SHM_SWP_TYPE
)
132 if (type
>= nr_swapfiles
)
134 p
= & swap_info
[type
];
135 if (!(p
->flags
& SWP_USED
))
/* Freeing on a higher-priority device: restart allocation rotation. */
137 if (p
->prio
> swap_info
[swap_list
.next
].prio
)
138 swap_list
.next
= swap_list
.head
;
139 offset
= SWP_OFFSET(entry
);
140 if (offset
>= p
->max
)
/* Widen the free-range hints to include this slot. */
142 if (offset
< p
->lowest_bit
)
143 p
->lowest_bit
= offset
;
144 if (offset
> p
->highest_bit
)
145 p
->highest_bit
= offset
;
146 if (!p
->swap_map
[offset
])
/* SWAP_MAP_MAX means "pinned forever" -- never decrement it. */
148 if (p
->swap_map
[offset
] < SWAP_MAP_MAX
) {
149 if (!--p
->swap_map
[offset
])
153 printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
154 entry
, p
->swap_map
[offset
]);
/* Error exits (labels lost in extraction): */
160 printk("swap_free: Trying to free nonexistent swap-page\n");
163 printk("swap_free: Trying to free swap from unused swap-device\n");
166 printk("swap_free: offset exceeds max\n");
169 printk("swap_free: swap-space map bad (entry %08lx)\n",entry
);
/*
 * unuse_pte(): replace one PTE's reference to a swap entry with a
 * reference to the in-memory page (made dirty, reference counted).
 * NOTE(review): extraction dropped the pte load and early returns;
 * code left byte-identical.
 */
174 * The swap entry has been read in advance, and we return 1 to indicate
175 * that the page has been used or is no longer needed.
177 * Always set the resulting pte to be nowrite (the same as COW pages
178 * after one process has exited). We don't know just how many PTEs will
179 * share this swap entry, so be cautious and let do_wp_page work out
180 * what to do if a write is requested later.
182 static inline void unuse_pte(struct vm_area_struct
* vma
, unsigned long address
,
183 pte_t
*dir
, unsigned long entry
, unsigned long page
)
/* Already present: only a swap-cached mapping of this page needs work. */
189 if (pte_present(pte
)) {
190 /* If this entry is swap-cached, then page must already
191 hold the right address for any copies in physical
193 if (pte_page(pte
) != page
)
195 /* We will be removing the swap cache in a moment, so... */
196 set_pte(dir
, pte_mkdirty(pte
));
/* Not our swap entry -- leave the PTE alone (skip path lost). */
199 if (pte_val(pte
) != entry
)
/* Install the in-memory page, dirty so it gets written back. */
201 set_pte(dir
, pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
/* One more PTE now references this physical page. */
203 atomic_inc(&mem_map
[MAP_NR(page
)].count
);
/*
 * unuse_pmd(): walk the PTEs of one page-middle-directory entry in
 * [address, address+size), calling unuse_pte() on each.  `offset`
 * carries the virtual base so unuse_pte sees addresses relative to
 * vma->vm_start.  Bad-pmd check and loop header partly lost in
 * extraction; code byte-identical.
 */
207 static inline void unuse_pmd(struct vm_area_struct
* vma
, pmd_t
*dir
,
208 unsigned long address
, unsigned long size
, unsigned long offset
,
209 unsigned long entry
, unsigned long page
)
/* Corrupt pmd entry: report and bail (surrounding branch lost). */
217 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir
));
221 pte
= pte_offset(dir
, address
);
/* Normalize: offset absorbs the PMD-aligned base, address the rest. */
222 offset
+= address
& PMD_MASK
;
223 address
&= ~PMD_MASK
;
224 end
= address
+ size
;
228 unuse_pte(vma
, offset
+address
-vma
->vm_start
, pte
, entry
, page
);
/* Advance one page per iteration (pte++ lost in extraction). */
229 address
+= PAGE_SIZE
;
231 } while (address
< end
);
/*
 * unuse_pgd(): walk the PMDs of one page-directory entry over
 * [address, address+size), clamped to PGDIR_SIZE, delegating each
 * PMD span to unuse_pmd().  Bad-pgd branch and loop header partly
 * lost in extraction; code byte-identical.
 */
234 static inline void unuse_pgd(struct vm_area_struct
* vma
, pgd_t
*dir
,
235 unsigned long address
, unsigned long size
,
236 unsigned long entry
, unsigned long page
)
239 unsigned long offset
, end
;
/* Corrupt pgd entry: report and bail (surrounding branch lost). */
244 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir
));
248 pmd
= pmd_offset(dir
, address
);
/* offset keeps the PGDIR-aligned virtual base for unuse_pte. */
249 offset
= address
& PGDIR_MASK
;
250 address
&= ~PGDIR_MASK
;
251 end
= address
+ size
;
/* Never walk past this page directory entry's span. */
252 if (end
> PGDIR_SIZE
)
255 unuse_pmd(vma
, pmd
, address
, end
- address
, offset
, entry
,
/* Step to the next PMD-aligned address (pmd++ lost in extraction). */
257 address
= (address
+ PMD_SIZE
) & PMD_MASK
;
259 } while (address
< end
);
/*
 * unuse_vma(): sweep one VMA's whole address range, one page
 * directory entry at a time, replacing references to `entry` with
 * `page` via unuse_pgd().  (pgdir++ per iteration lost in
 * extraction; code byte-identical.)
 */
262 static void unuse_vma(struct vm_area_struct
* vma
, pgd_t
*pgdir
,
263 unsigned long entry
, unsigned long page
)
265 unsigned long start
= vma
->vm_start
, end
= vma
->vm_end
;
267 while (start
< end
) {
268 unuse_pgd(vma
, pgdir
, start
, end
- start
, entry
, page
);
/* Round start up to the next PGDIR boundary. */
269 start
= (start
+ PGDIR_SIZE
) & PGDIR_MASK
;
/*
 * unuse_process(): scan every VMA of one process's address space and
 * swap `entry` back in as `page` via unuse_vma().  Skips kernel
 * threads (no mm, or mm == &init_mm).  The `page` parameter line was
 * lost in extraction; code byte-identical.
 */
274 static void unuse_process(struct mm_struct
* mm
, unsigned long entry
,
277 struct vm_area_struct
* vma
;
280 * Go through process' page directory.
282 if (!mm
|| mm
== &init_mm
)
284 for (vma
= mm
->mmap
; vma
; vma
= vma
->vm_next
) {
285 pgd_t
* pgd
= pgd_offset(mm
, vma
->vm_start
);
286 unuse_vma(vma
, pgd
, entry
, page
);
/*
 * try_to_unuse(): empty swap device `type` by finding each in-use
 * entry, reading its page into memory, and removing all references
 * from every process (under tasklist_lock) and from SysV shm.
 * NOTE(review): the outer retry loop, allocation, and error paths
 * were lost in extraction; code left byte-identical.
 */
292 * We completely avoid races by reading each swap page in advance,
293 * and then search for the process using it. All the necessary
294 * page table adjustments can then be made atomically.
296 static int try_to_unuse(unsigned int type
)
298 struct swap_info_struct
* si
= &swap_info
[type
];
299 struct task_struct
*p
;
300 unsigned long page
= 0;
301 struct page
*page_map
;
307 * Find a swap page in use and read it in.
/* Slot 0 holds the swap header, so the scan starts at 1. */
309 for (i
= 1 , entry
= 0; i
< si
->max
; i
++) {
310 if (si
->swap_map
[i
] > 0 && si
->swap_map
[i
] != SWAP_MAP_BAD
) {
311 entry
= SWP_ENTRY(type
, i
);
318 /* Get a page for the entry, using the existing swap
319 cache page if there is one. Otherwise, get a clean
320 page and read the swap into it. */
321 page_map
= read_swap_cache(entry
);
324 page
= page_address(page_map
);
/* Walk every task's mm while holding the task list read lock. */
325 read_lock(&tasklist_lock
);
327 unuse_process(p
->mm
, entry
, page
);
328 read_unlock(&tasklist_lock
);
329 shm_unuse(entry
, page
);
330 /* Now get rid of the extra reference to the temporary
331 page we've been using. */
332 if (PageSwapCache(page_map
))
333 delete_from_swap_cache(page_map
);
/* Map count should be 0 now unless the entry is pinned (MAX). */
335 if (si
->swap_map
[i
] != 0) {
336 if (si
->swap_map
[i
] != SWAP_MAP_MAX
)
337 printk("try_to_unuse: entry %08lx "
338 "not in use\n", entry
);
/*
 * sys_swapoff(): disable swapping on a device or file.  Resolves the
 * path, finds the matching active swap area, unlinks it from
 * swap_list, pulls all its pages back into memory via try_to_unuse(),
 * then releases the block device and frees the lock map.
 * NOTE(review): garbled extraction -- permission/error returns, the
 * failure re-insert branch structure, and cleanup labels are partly
 * missing; code left byte-identical.
 */
346 asmlinkage
int sys_swapoff(const char * specialfile
)
348 struct swap_info_struct
* p
= NULL
;
349 struct dentry
* dentry
;
/* Privileged operation. */
355 if (!capable(CAP_SYS_ADMIN
))
358 dentry
= namei(specialfile
);
359 err
= PTR_ERR(dentry
);
/* Search active (writable) swap areas for one matching the path. */
364 for (type
= swap_list
.head
; type
>= 0; type
= swap_info
[type
].next
) {
365 p
= swap_info
+ type
;
366 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
/* Match either by dentry (swap file) or by block device number. */
368 if (p
->swap_file
== dentry
)
371 if (S_ISBLK(dentry
->d_inode
->i_mode
)
372 && (p
->swap_device
== dentry
->d_inode
->i_rdev
))
/* Unlink this area from the singly-linked priority list. */
384 swap_list
.head
= p
->next
;
386 swap_info
[prev
].next
= p
->next
;
388 if (type
== swap_list
.next
) {
389 /* just pick something that's safe... */
390 swap_list
.next
= swap_list
.head
;
/* Bring every page on this device back into memory. */
393 err
= try_to_unuse(type
);
/* On failure, the area is re-inserted below (branch lost). */
396 /* re-insert swap space back into swap_list */
397 for (prev
= -1, i
= swap_list
.head
; i
>= 0; prev
= i
, i
= swap_info
[i
].next
)
398 if (p
->prio
>= swap_info
[i
].prio
)
402 swap_list
.head
= swap_list
.next
= p
- swap_info
;
404 swap_info
[prev
].next
= p
- swap_info
;
405 p
->flags
= SWP_WRITEOK
;
/* Build a throwaway struct file so we can call the blkdev fops. */
409 memset(&filp
, 0, sizeof(filp
));
410 filp
.f_dentry
= dentry
;
411 filp
.f_mode
= 3; /* read write */
412 /* open it again to get fops */
413 if( !blkdev_open(dentry
->d_inode
, &filp
) &&
414 filp
.f_op
&& filp
.f_op
->release
){
/* release() called twice -- presumably balancing both the
   swapon-time open and this reopen; TODO confirm against the
   original source. */
415 filp
.f_op
->release(dentry
->d_inode
,&filp
);
416 filp
.f_op
->release(dentry
->d_inode
,&filp
);
/* Account the removed pages out of the global free-swap total. */
421 nr_swap_pages
-= p
->pages
;
427 vfree(p
->swap_lockmap
);
428 p
->swap_lockmap
= NULL
;
/*
 * get_swaparea_info(): format a /proc-style table of active swap
 * areas (path, type, size, used, priority) into `buf`; returns the
 * byte count via `len` (final return lost in extraction).  Uses a
 * temporary page for d_path().  Code left byte-identical.
 */
436 int get_swaparea_info(char *buf
)
438 char * page
= (char *) __get_free_page(GFP_KERNEL
);
439 struct swap_info_struct
*ptr
= swap_info
;
440 int i
, j
, len
= 0, usedswap
;
445 len
+= sprintf(buf
, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
446 for (i
= 0 ; i
< nr_swapfiles
; i
++, ptr
++) {
447 if (ptr
->flags
& SWP_USED
) {
448 char * path
= d_path(ptr
->swap_file
, page
, PAGE_SIZE
);
450 len
+= sprintf(buf
+ len
, "%-31s ", path
);
/* No device number means a regular swap file, else a partition. */
452 if (!ptr
->swap_device
)
453 len
+= sprintf(buf
+ len
, "file\t\t");
455 len
+= sprintf(buf
+ len
, "partition\t");
/* Tally used slots (switch cases lost in extraction). */
458 for (j
= 0; j
< ptr
->max
; ++j
)
459 switch (ptr
->swap_map
[j
]) {
/* Convert page counts to kilobytes for display. */
466 len
+= sprintf(buf
+ len
, "%d\t%d\t%d\n", ptr
->pages
<< (PAGE_SHIFT
- 10),
467 usedswap
<< (PAGE_SHIFT
- 10), ptr
->prio
);
470 free_page((unsigned long) page
);
/*
 * sys_swapon(): activate a swap device or file.  Visible steps:
 * find a free swap_info slot, set priority (explicit or descending
 * default), resolve the path, open/validate a block device or check
 * for a regular file, read and parse the swap header (v1 bitmap or
 * v2 info page), build swap_map/lockmap, account pages, and insert
 * the area into the priority-ordered swap_list.
 * NOTE(review): garbled extraction -- error labels, several branch
 * bodies, and the final return are missing; code byte-identical.
 */
475 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
477 * The swapon system call
479 asmlinkage
int sys_swapon(const char * specialfile
, int swap_flags
)
481 struct swap_info_struct
* p
;
482 struct dentry
* swap_dentry
;
/* Default priorities count down so later swapons rank lower. */
487 static int least_priority
= 0;
488 union swap_header
*swap_header
= 0;
489 int swap_header_version
;
490 int lock_map_size
= PAGE_SIZE
;
491 int nr_good_pages
= 0;
/* One-byte stand-in lock map used only while reading the header. */
492 char tmp_lock_map
= 0;
495 if (!capable(CAP_SYS_ADMIN
))
497 memset(&filp
, 0, sizeof(filp
));
/* Find the first unused swap_info slot. */
499 for (type
= 0 ; type
< nr_swapfiles
; type
++,p
++)
500 if (!(p
->flags
& SWP_USED
))
502 if (type
>= MAX_SWAPFILES
)
504 if (type
>= nr_swapfiles
)
505 nr_swapfiles
= type
+1;
510 p
->swap_lockmap
= NULL
;
/* Caller-supplied priority, else next descending default. */
516 if (swap_flags
& SWAP_FLAG_PREFER
) {
518 (swap_flags
& SWAP_FLAG_PRIO_MASK
)>>SWAP_FLAG_PRIO_SHIFT
;
520 p
->prio
= --least_priority
;
522 swap_dentry
= namei(specialfile
);
523 error
= PTR_ERR(swap_dentry
);
524 if (IS_ERR(swap_dentry
))
527 p
->swap_file
= swap_dentry
;
/* Block device: open it and sanity-check its size table. */
530 if (S_ISBLK(swap_dentry
->d_inode
->i_mode
)) {
531 p
->swap_device
= swap_dentry
->d_inode
->i_rdev
;
532 set_blocksize(p
->swap_device
, PAGE_SIZE
);
534 filp
.f_dentry
= swap_dentry
;
535 filp
.f_mode
= 3; /* read write */
536 error
= blkdev_open(swap_dentry
->d_inode
, &filp
);
539 set_blocksize(p
->swap_device
, PAGE_SIZE
);
541 if (!p
->swap_device
||
542 (blk_size
[MAJOR(p
->swap_device
)] &&
543 !blk_size
[MAJOR(p
->swap_device
)][MINOR(p
->swap_device
)]))
/* Reject a device that is already active as swap. */
546 for (i
= 0 ; i
< nr_swapfiles
; i
++) {
549 if (p
->swap_device
== swap_info
[i
].swap_device
)
552 } else if (!S_ISREG(swap_dentry
->d_inode
->i_mode
))
554 swap_header
= (void *) __get_free_page(GFP_USER
);
556 printk("Unable to start swapping: out of memory :-)\n");
/* Use the 1-byte dummy lockmap while reading page 0 (the header). */
561 p
->swap_lockmap
= &tmp_lock_map
;
562 rw_swap_page_nocache(READ
, SWP_ENTRY(type
,0), (char *) swap_header
);
/* Identify the on-disk header format by its trailing signature. */
565 if (!memcmp("SWAP-SPACE",swap_header
->magic
.magic
,10))
566 swap_header_version
= 1;
567 else if (!memcmp("SWAPSPACE2",swap_header
->magic
.magic
,10))
568 swap_header_version
= 2;
570 printk("Unable to find swap-space signature\n");
575 switch (swap_header_version
) {
/* v1: wipe the signature, then the header page itself is a
   bitmap of good pages. */
577 memset(((char *) swap_header
)+PAGE_SIZE
-10,0,10);
581 for (i
= 1 ; i
< 8*PAGE_SIZE
; i
++) {
582 if (test_bit(i
,(char *) swap_header
)) {
591 p
->swap_map
= vmalloc(p
->max
* sizeof(short));
/* Good pages start free (0); everything else is SWAP_MAP_BAD. */
596 for (i
= 1 ; i
< p
->max
; i
++) {
597 if (test_bit(i
,(char *) swap_header
))
600 p
->swap_map
[i
] = SWAP_MAP_BAD
;
605 /* Check the swap header's sub-version and the size of
606 the swap file and bad block lists */
607 if (swap_header
->info
.version
!= 1) {
609 "Unable to handle swap header version %d\n",
610 swap_header
->info
.version
);
616 p
->highest_bit
= swap_header
->info
.last_page
- 1;
617 p
->max
= swap_header
->info
.last_page
;
/* Reject absurd sizes or a bad-page list overrunning the header. */
619 if (p
->max
>= 0x7fffffffL
/PAGE_SIZE
||
620 (void *) &swap_header
->info
.badpages
[swap_header
->info
.nr_badpages
-1] >= (void *) swap_header
->magic
.magic
) {
625 /* OK, set up the swap map and apply the bad block list */
626 if (!(p
->swap_map
= vmalloc (p
->max
* sizeof(short)))) {
632 memset(p
->swap_map
, 0, p
->max
* sizeof(short));
633 for (i
=0; i
<swap_header
->info
.nr_badpages
; i
++) {
634 int page
= swap_header
->info
.badpages
[i
];
635 if (page
<= 0 || page
>= swap_header
->info
.last_page
)
638 p
->swap_map
[page
] = SWAP_MAP_BAD
;
640 nr_good_pages
= swap_header
->info
.last_page
- i
;
/* One lock bit per page slot, rounded up to whole bytes. */
641 lock_map_size
= (p
->max
+ 7) / 8;
646 if (!nr_good_pages
) {
647 printk(KERN_WARNING
"Empty swap-file\n");
/* Slot 0 holds the header itself -- never allocate it. */
651 p
->swap_map
[0] = SWAP_MAP_BAD
;
652 if (!(p
->swap_lockmap
= vmalloc (lock_map_size
))) {
656 memset(p
->swap_lockmap
,0,lock_map_size
);
657 p
->flags
= SWP_WRITEOK
;
658 p
->pages
= nr_good_pages
;
659 nr_swap_pages
+= nr_good_pages
;
660 printk(KERN_INFO
"Adding Swap: %dk swap-space (priority %d)\n",
661 nr_good_pages
<<(PAGE_SHIFT
-10), p
->prio
);
663 /* insert swap space into swap_list: */
/* Insert before the first area of lower priority. */
665 for (i
= swap_list
.head
; i
>= 0; i
= swap_info
[i
].next
) {
666 if (p
->prio
>= swap_info
[i
].prio
) {
673 swap_list
.head
= swap_list
.next
= p
- swap_info
;
675 swap_info
[prev
].next
= p
- swap_info
;
/* Error-path cleanup (labels lost in extraction): */
680 if(filp
.f_op
&& filp
.f_op
->release
)
681 filp
.f_op
->release(filp
.f_dentry
->d_inode
,&filp
);
684 vfree(p
->swap_lockmap
);
691 p
->swap_lockmap
= NULL
;
695 free_page((long) swap_header
);
/*
 * si_swapinfo(): fill val->freeswap / val->totalswap for sysinfo().
 * Tallies per-slot state across all writable swap areas (the switch
 * cases that do the counting were lost in extraction), then converts
 * page counts to bytes via PAGE_SHIFT.  Code left byte-identical.
 */
700 void si_swapinfo(struct sysinfo
*val
)
704 val
->freeswap
= val
->totalswap
= 0;
705 for (i
= 0; i
< nr_swapfiles
; i
++) {
/* Skip areas that are not fully enabled for writing. */
706 if ((swap_info
[i
].flags
& SWP_WRITEOK
) != SWP_WRITEOK
)
708 for (j
= 0; j
< swap_info
[i
].max
; ++j
)
709 switch (swap_info
[i
].swap_map
[j
]) {
/* Convert from pages to bytes. */
718 val
->freeswap
<<= PAGE_SHIFT
;
719 val
->totalswap
<<= PAGE_SHIFT
;