4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie
8 #include <linux/malloc.h>
9 #include <linux/smp_lock.h>
10 #include <linux/kernel_stat.h>
11 #include <linux/swap.h>
12 #include <linux/swapctl.h>
13 #include <linux/blkdev.h> /* for blk_size */
14 #include <linux/vmalloc.h>
15 #include <linux/pagemap.h>
16 #include <linux/shm.h>
18 #include <asm/pgtable.h>
/*
 * File-scope swap bookkeeping.
 *
 * nr_swapfiles: upper bound used by the swap_info[] scans in this file
 * (loops of the form i < nr_swapfiles); sys_swapon sets it to type+1 when
 * the chosen slot index is at or beyond the current value.
 */
20 unsigned int nr_swapfiles
= 0;
/*
 * swap_list: its head and next members hold swap_info[] indices and are
 * followed through swap_info[i].next until an index goes negative.  Both
 * members start at -1.
 */
22 struct swap_list_t swap_list
= {-1, -1};
/* One descriptor per swap area, indexed by swap type. */
24 struct swap_info_struct swap_info
[MAX_SWAPFILES
];
/* Value that scan_swap_map stores into si->cluster_nr when it resets it. */
26 #define SWAPFILE_CLUSTER 256
/*
 * scan_swap_map - choose a slot offset inside one swap area (si).
 *
 * Visible in this listing:
 *  - a while loop steps si->cluster_next forward while it is not above
 *    si->highest_bit, testing si->swap_map[offset] and the bit for offset
 *    in si->swap_lockmap;
 *  - si->cluster_nr is then set to SWAPFILE_CLUSTER and a for loop applies
 *    the same two tests to every offset from si->lowest_bit up to
 *    si->highest_bit;
 *  - further down, si->lowest_bit is set to offset, si->swap_map[offset]
 *    is set to 1 and si->cluster_next is set to offset.
 *
 * NOTE(review): the listing skips source lines (the leading line numbers
 * have gaps).  The statements run when a test succeeds, the statement
 * guarded by the offset == si->highest_bit test, and every return
 * statement are not shown - confirm against the complete file.
 */
28 static inline int scan_swap_map(struct swap_info_struct
*si
)
32 * We try to cluster swap pages by allocating them
33 * sequentially in swap. Once we've allocated
34 * SWAPFILE_CLUSTER pages this way, however, we resort to
35 * first-free allocation, starting a new cluster. This
36 * prevents us from scattering swap pages all over the entire
37 * swap partition, so that we reduce overall disk seek times
38 * between swap pages. -- sct */
40 while (si
->cluster_next
<= si
->highest_bit
) {
41 offset
= si
->cluster_next
++;
42 if (si
->swap_map
[offset
])
44 if (test_bit(offset
, si
->swap_lockmap
))
50 si
->cluster_nr
= SWAPFILE_CLUSTER
;
51 for (offset
= si
->lowest_bit
; offset
<= si
->highest_bit
; offset
++) {
52 if (si
->swap_map
[offset
])
54 if (test_bit(offset
, si
->swap_lockmap
))
56 si
->lowest_bit
= offset
;
58 si
->swap_map
[offset
] = 1;
60 if (offset
== si
->highest_bit
)
62 si
->cluster_next
= offset
;
/*
 * get_swap_page - hand out a swap entry, an unsigned long built with
 * SWP_ENTRY(type, offset).
 *
 * Visible in this listing:
 *  - type starts at swap_list.next, and nr_swap_pages is tested for zero;
 *  - when all SWP_WRITEOK bits are set in p->flags, scan_swap_map(p)
 *    supplies offset and entry becomes SWP_ENTRY(type, offset);
 *  - type is then advanced to swap_info[type].next; swap_list.next is set
 *    to swap_list.head under a condition that includes a priority
 *    mismatch (p->prio != swap_info[type].prio), and to type on another
 *    path;
 *  - a test of type < 0 or a priority mismatch resets type to
 *    swap_list.head;
 *  - an else-if on type < 0 returns 0 (out of swap space).
 *
 * NOTE(review): source lines are missing from the listing.  The
 * assignment of p, the enclosing loop, the use of the local wrapped, the
 * if that the else-if pairs with, and the successful return are not
 * shown - confirm against the complete file.
 */
68 unsigned long get_swap_page(void)
70 struct swap_info_struct
* p
;
71 unsigned long offset
, entry
;
72 int type
, wrapped
= 0;
/* Begin the search at the list position saved in swap_list.next. */
74 type
= swap_list
.next
;
77 if (nr_swap_pages
== 0)
82 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
83 offset
= scan_swap_map(p
);
85 entry
= SWP_ENTRY(type
,offset
);
86 type
= swap_info
[type
].next
;
88 p
->prio
!= swap_info
[type
].prio
)
90 swap_list
.next
= swap_list
.head
;
94 swap_list
.next
= type
;
101 if (type
< 0 || p
->prio
!= swap_info
[type
].prio
) {
102 type
= swap_list
.head
;
105 } else if (type
< 0) {
106 return 0; /* out of swap space */
/*
 * swap_free - drop one count on the swap slot named by entry.
 *
 * Visible in this listing:
 *  - type = SWP_TYPE(entry) is tested against SHM_SWP_TYPE and against
 *    nr_swapfiles;
 *  - p = &swap_info[type] is tested for SWP_USED;
 *  - if p->prio is greater than the priority of the area at
 *    swap_list.next, swap_list.next is reset to swap_list.head;
 *  - offset = SWP_OFFSET(entry) is tested against p->max;
 *  - p->lowest_bit and p->highest_bit are widened to include offset;
 *  - p->swap_map[offset] is tested for zero; when it is below
 *    SWAP_MAP_MAX it is pre-decremented and the result tested for zero;
 *  - the trailing printk calls report a nonexistent swap-page, an unused
 *    swap-device, an offset beyond max and a bad swap-space map.
 *
 * NOTE(review): source lines are missing from the listing, so which test
 * leads to which message, and what happens when the count reaches zero,
 * are not shown - confirm against the complete file.
 */
112 void swap_free(unsigned long entry
)
114 struct swap_info_struct
* p
;
115 unsigned long offset
, type
;
/* Split the entry into its swap-area index ... */
120 type
= SWP_TYPE(entry
);
121 if (type
& SHM_SWP_TYPE
)
123 if (type
>= nr_swapfiles
)
125 p
= & swap_info
[type
];
126 if (!(p
->flags
& SWP_USED
))
128 if (p
->prio
> swap_info
[swap_list
.next
].prio
)
129 swap_list
.next
= swap_list
.head
;
/* ... and its slot offset inside that area. */
130 offset
= SWP_OFFSET(entry
);
131 if (offset
>= p
->max
)
133 if (offset
< p
->lowest_bit
)
134 p
->lowest_bit
= offset
;
135 if (offset
> p
->highest_bit
)
136 p
->highest_bit
= offset
;
137 if (!p
->swap_map
[offset
])
/* Counts that have reached SWAP_MAP_MAX are not decremented here. */
139 if (p
->swap_map
[offset
] < SWAP_MAP_MAX
) {
140 if (!--p
->swap_map
[offset
])
144 printk("DebugVM: swap_free(entry %08lx, count now %d)\n",
145 entry
, p
->swap_map
[offset
]);
151 printk("swap_free: Trying to free nonexistent swap-page\n");
154 printk("swap_free: Trying to free swap from unused swap-device\n");
157 printk("swap_free: offset exceeds max\n");
160 printk("swap_free: swap-space map bad (entry %08lx)\n",entry
);
165 * The swap entry has been read in advance. unuse_pte() is declared void, so
166 * it does not report whether the page has been used or is no longer needed.
168 * Always set the resulting pte to be nowrite (the same as COW pages
169 * after one process has exited). We don't know just how many PTEs will
170 * share this swap entry, so be cautious and let do_wp_page work out
171 * what to do if a write is requested later.
/*
 * unuse_pte - rewrite one page-table entry that refers to entry or page.
 *
 * vma:     area whose vm_page_prot is used when a new pte is built
 * address: supplied by the caller; not referenced in the lines shown
 * dir:     pte slot written through set_pte()
 * entry:   swap entry value compared with pte_val(pte)
 * page:    address compared with pte_page(pte) and passed to mk_pte()
 *          and MAP_NR()
 *
 * Declared void: nothing is returned.
 *
 * Visible in this listing:
 *  - for a present pte, pte_page(pte) is compared with page and the slot
 *    is rewritten with pte_mkdirty(pte);
 *  - further down, pte_val(pte) is compared with entry, the slot is
 *    rewritten with a dirty pte made from page and vma->vm_page_prot, and
 *    the count of the mem_map[] element for page is incremented.
 *
 * NOTE(review): source lines are missing from the listing (the
 * declaration and load of pte, the statements run when the comparisons
 * fail, closing braces) - confirm against the complete file.
 */
173 static inline void unuse_pte(struct vm_area_struct
* vma
, unsigned long address
,
174 pte_t
*dir
, unsigned long entry
, unsigned long page
)
180 if (pte_present(pte
)) {
181 /* If this entry is swap-cached, then page must already
182 hold the right address for any copies in physical
184 if (pte_page(pte
) != page
)
186 /* We will be removing the swap cache in a moment, so... */
187 set_pte(dir
, pte_mkdirty(pte
));
190 if (pte_val(pte
) != entry
)
192 set_pte(dir
, pte_mkdirty(mk_pte(page
, vma
->vm_page_prot
)));
194 atomic_inc(&mem_map
[MAP_NR(page
)].count
);
/*
 * unuse_pmd - call unuse_pte() for each page in a range under one pmd
 * entry.
 *
 * vma, entry, page: forwarded unchanged to unuse_pte()
 * dir:     pmd entry; its value is printed by the bad-pmd message and it
 *          is passed to pte_offset()
 * address: start of the range
 * size:    length of the range; end = (address & ~PMD_MASK) + size
 * offset:  base supplied by the caller, increased by address & PMD_MASK
 *
 * NOTE(review): source lines are missing from the listing (local
 * declarations, the tests around the bad-pmd printk, the opening of the
 * do loop, any clamping of end, any advance of pte) - confirm against the
 * complete file.
 */
198 static inline void unuse_pmd(struct vm_area_struct
* vma
, pmd_t
*dir
,
199 unsigned long address
, unsigned long size
, unsigned long offset
,
200 unsigned long entry
, unsigned long page
)
208 printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir
));
212 pte
= pte_offset(dir
, address
);
/* Move the PMD_MASK bits of address into offset; keep the remainder. */
213 offset
+= address
& PMD_MASK
;
214 address
&= ~PMD_MASK
;
215 end
= address
+ size
;
/* The address handed to unuse_pte() is relative to vma->vm_start. */
219 unuse_pte(vma
, offset
+address
-vma
->vm_start
, pte
, entry
, page
);
220 address
+= PAGE_SIZE
;
222 } while (address
< end
);
/*
 * unuse_pgd - call unuse_pmd() for each PMD_SIZE step of a range under
 * one pgd entry.
 *
 * vma, entry: forwarded to unuse_pmd()
 * page:    not referenced in the lines shown (the unuse_pmd() call is cut
 *          off after entry) - presumably forwarded too; confirm
 * dir:     pgd entry; its value is printed by the bad-pgd message and it
 *          is passed to pmd_offset()
 * address: start of the range; its PGDIR_MASK bits become offset and the
 *          remaining bits are kept
 * size:    length of the range; end = (address & ~PGDIR_MASK) + size,
 *          which is tested against PGDIR_SIZE
 *
 * After each call address becomes (address + PMD_SIZE) & PMD_MASK.
 *
 * NOTE(review): source lines are missing from the listing (the pmd
 * declaration, the tests around the printk, the statement guarded by the
 * end > PGDIR_SIZE test, the rest of the unuse_pmd() argument list, any
 * advance of pmd) - confirm against the complete file.
 */
225 static inline void unuse_pgd(struct vm_area_struct
* vma
, pgd_t
*dir
,
226 unsigned long address
, unsigned long size
,
227 unsigned long entry
, unsigned long page
)
230 unsigned long offset
, end
;
235 printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir
));
239 pmd
= pmd_offset(dir
, address
);
240 offset
= address
& PGDIR_MASK
;
241 address
&= ~PGDIR_MASK
;
242 end
= address
+ size
;
243 if (end
> PGDIR_SIZE
)
246 unuse_pmd(vma
, pmd
, address
, end
- address
, offset
, entry
,
248 address
= (address
+ PMD_SIZE
) & PMD_MASK
;
250 } while (address
< end
);
/*
 * unuse_vma - walk one vm area from vma->vm_start to vma->vm_end, calling
 * unuse_pgd() on the remaining range and then moving start to
 * (start + PGDIR_SIZE) & PGDIR_MASK.
 *
 * vma:   area to walk
 * pgdir: pgd entry passed to unuse_pgd()
 * entry: swap entry, forwarded unchanged
 * page:  page address, forwarded unchanged
 *
 * NOTE(review): source lines are missing from the listing; whether pgdir
 * is advanced between iterations is not shown - confirm against the
 * complete file.
 */
253 static void unuse_vma(struct vm_area_struct
* vma
, pgd_t
*pgdir
,
254 unsigned long entry
, unsigned long page
)
256 unsigned long start
= vma
->vm_start
, end
= vma
->vm_end
;
258 while (start
< end
) {
259 unuse_pgd(vma
, pgdir
, start
, end
- start
, entry
, page
);
260 start
= (start
+ PGDIR_SIZE
) & PGDIR_MASK
;
/*
 * unuse_process - run unuse_vma() over every vm area of one mm.
 *
 * mm:    address space to walk; tested for NULL and for &init_mm first
 * entry: swap entry, forwarded to unuse_vma()
 *
 * For each vma on the mm->mmap list (linked through vm_next) the pgd
 * entry for vma->vm_start is looked up with pgd_offset() and passed to
 * unuse_vma() together with entry and page.
 *
 * NOTE(review): source lines are missing from the listing.  The parameter
 * list is cut off after entry (page is used in the body, so it is
 * presumably the last parameter), and the statement run when the mm test
 * succeeds is not shown - confirm against the complete file.
 */
265 static void unuse_process(struct mm_struct
* mm
, unsigned long entry
,
268 struct vm_area_struct
* vma
;
271 * Go through process' page directory.
273 if (!mm
|| mm
== &init_mm
)
275 for (vma
= mm
->mmap
; vma
; vma
= vma
->vm_next
) {
276 pgd_t
* pgd
= pgd_offset(mm
, vma
->vm_start
);
277 unuse_vma(vma
, pgd
, entry
, page
);
283 * We completely avoid races by reading each swap page in advance,
284 * and then search for the process using it. All the necessary
285 * page table adjustments can then be made atomically.
/*
 * try_to_unuse - visit every in-use slot of swap area type.
 *
 * type: index into swap_info[]
 *
 * Visible in this listing, for each i from 1 up to si->max - 1 whose
 * swap_map count is above zero and is not SWAP_MAP_BAD:
 *  - entry = SWP_ENTRY(type, i) and page_map = read_swap_cache(entry);
 *  - the count is tested again for zero after the read;
 *  - page = page_address(page_map);
 *  - with tasklist_lock read-held, unuse_process(p->mm, entry, page) is
 *    called, then shm_unuse(entry, page) after the lock is released;
 *  - the page is removed from the swap cache if PageSwapCache() says it
 *    is there, and then freed with __free_page();
 *  - a count that is still non-zero is compared with SWAP_MAP_MAX, and a
 *    message giving the entry and count is formatted.
 *
 * NOTE(review): source lines are missing from the listing (the
 * declaration of i, the loop over tasks that sets p, the handling of a
 * failed read_swap_cache(), what is done with a leftover count, and the
 * return statements) - confirm against the complete file.
 */
287 static int try_to_unuse(unsigned int type
)
289 struct swap_info_struct
* si
= &swap_info
[type
];
290 struct task_struct
*p
;
291 struct page
*page_map
;
292 unsigned long entry
, page
;
297 * Find a swap page in use and read it in.
299 for (i
= 1; i
< si
->max
; i
++) {
300 if (si
->swap_map
[i
] > 0 && si
->swap_map
[i
] != SWAP_MAP_BAD
) {
307 entry
= SWP_ENTRY(type
, i
);
309 /* Get a page for the entry, using the existing swap
310 cache page if there is one. Otherwise, get a clean
311 page and read the swap into it. */
312 page_map
= read_swap_cache(entry
);
315 * Continue searching if the entry became unused.
317 if (si
->swap_map
[i
] == 0)
321 page
= page_address(page_map
);
322 read_lock(&tasklist_lock
);
324 unuse_process(p
->mm
, entry
, page
);
325 read_unlock(&tasklist_lock
);
326 shm_unuse(entry
, page
);
327 /* Now get rid of the extra reference to the temporary
328 page we've been using. */
329 if (PageSwapCache(page_map
))
330 delete_from_swap_cache(page_map
);
331 __free_page(page_map
);
333 * Check for and clear any overflowed swap map counts.
335 if (si
->swap_map
[i
] != 0) {
336 if (si
->swap_map
[i
] != SWAP_MAP_MAX
)
338 "try_to_unuse: entry %08lx count=%d\n",
339 entry
, si
->swap_map
[i
]);
/*
 * sys_swapoff - system call that takes the swap area named by specialfile
 * out of service.
 *
 * specialfile: path resolved with namei()
 *
 * Visible in this listing:
 *  - the caller must pass capable(CAP_SYS_ADMIN);
 *  - the swap list is walked from swap_list.head; an area with all
 *    SWP_WRITEOK bits set is compared with the dentry (p->swap_file), or,
 *    for a block device inode, by device number (p->swap_device against
 *    i_rdev);
 *  - the area is unlinked by pointing swap_list.head or
 *    swap_info[prev].next at p->next, and swap_list.next is reset to
 *    swap_list.head if it pointed at this area;
 *  - err = try_to_unuse(type);
 *  - a re-insert path walks the list comparing priorities, links p back
 *    in and sets p->flags to SWP_WRITEOK;
 *  - a file struct is zeroed and reopened with blkdev_open() to obtain
 *    f_op, and f_op->release is called twice on it;
 *  - nr_swap_pages is reduced by p->pages, and p->swap_lockmap is freed
 *    with vfree() and set to NULL.
 *
 * NOTE(review): source lines are missing from the listing (declarations
 * of err, type, prev, i and filp, the conditions selecting the unlink,
 * re-insert and device-release paths, the remaining teardown and the
 * return value) - confirm against the complete file.
 */
347 asmlinkage
int sys_swapoff(const char * specialfile
)
349 struct swap_info_struct
* p
= NULL
;
350 struct dentry
* dentry
;
356 if (!capable(CAP_SYS_ADMIN
))
359 dentry
= namei(specialfile
);
360 err
= PTR_ERR(dentry
);
/* Look for the swap area that matches the resolved dentry. */
365 for (type
= swap_list
.head
; type
>= 0; type
= swap_info
[type
].next
) {
366 p
= swap_info
+ type
;
367 if ((p
->flags
& SWP_WRITEOK
) == SWP_WRITEOK
) {
369 if (p
->swap_file
== dentry
)
372 if (S_ISBLK(dentry
->d_inode
->i_mode
)
373 && (p
->swap_device
== dentry
->d_inode
->i_rdev
))
384 swap_list
.head
= p
->next
;
386 swap_info
[prev
].next
= p
->next
;
388 if (type
== swap_list
.next
) {
389 /* just pick something that's safe... */
390 swap_list
.next
= swap_list
.head
;
393 err
= try_to_unuse(type
);
395 /* re-insert swap space back into swap_list */
396 for (prev
= -1, i
= swap_list
.head
; i
>= 0; prev
= i
, i
= swap_info
[i
].next
)
397 if (p
->prio
>= swap_info
[i
].prio
)
401 swap_list
.head
= swap_list
.next
= p
- swap_info
;
403 swap_info
[prev
].next
= p
- swap_info
;
404 p
->flags
= SWP_WRITEOK
;
408 memset(&filp
, 0, sizeof(filp
));
409 filp
.f_dentry
= dentry
;
410 filp
.f_mode
= 3; /* read write */
411 /* open it again to get fops */
412 if( !blkdev_open(dentry
->d_inode
, &filp
) &&
413 filp
.f_op
&& filp
.f_op
->release
){
414 filp
.f_op
->release(dentry
->d_inode
,&filp
);
415 filp
.f_op
->release(dentry
->d_inode
,&filp
);
420 dentry
= p
->swap_file
;
422 nr_swap_pages
-= p
->pages
;
426 vfree(p
->swap_lockmap
);
427 p
->swap_lockmap
= NULL
;
/*
 * get_swaparea_info - format a text table describing the swap areas into
 * buf.
 *
 * buf: output buffer written with sprintf(); no size limit is applied in
 *      the lines shown
 *
 * A scratch page obtained with __get_free_page(GFP_KERNEL) is used as the
 * d_path() work buffer and is released with free_page() at the end.  For
 * every swap_info[] slot below nr_swapfiles that has SWP_USED set, one
 * row is written: path, the word file or partition (chosen by whether
 * ptr->swap_device is zero), ptr->pages and usedswap each shifted left by
 * PAGE_SHIFT - 10, and ptr->prio.  len accumulates the sprintf() return
 * values.
 *
 * NOTE(review): source lines are missing from the listing (any check of
 * the scratch page allocation, the switch cases that compute usedswap,
 * and the return statement) - confirm against the complete file.
 */
438 int get_swaparea_info(char *buf
)
440 char * page
= (char *) __get_free_page(GFP_KERNEL
);
441 struct swap_info_struct
*ptr
= swap_info
;
442 int i
, j
, len
= 0, usedswap
;
/* Header row. */
447 len
+= sprintf(buf
, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n");
448 for (i
= 0 ; i
< nr_swapfiles
; i
++, ptr
++) {
449 if (ptr
->flags
& SWP_USED
) {
450 char * path
= d_path(ptr
->swap_file
, page
, PAGE_SIZE
);
452 len
+= sprintf(buf
+ len
, "%-31s ", path
);
454 if (!ptr
->swap_device
)
455 len
+= sprintf(buf
+ len
, "file\t\t");
457 len
+= sprintf(buf
+ len
, "partition\t");
/* Classify every swap_map slot of this area. */
460 for (j
= 0; j
< ptr
->max
; ++j
)
461 switch (ptr
->swap_map
[j
]) {
468 len
+= sprintf(buf
+ len
, "%d\t%d\t%d\n", ptr
->pages
<< (PAGE_SHIFT
- 10),
469 usedswap
<< (PAGE_SHIFT
- 10), ptr
->prio
);
472 free_page((unsigned long) page
);
477 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
479 * The swapon system call
/*
 * sys_swapon - system call that brings the swap area named by specialfile
 * into service.
 *
 * specialfile: path resolved with namei()
 * swap_flags:  tested for SWAP_FLAG_PREFER; a priority field is extracted
 *              with SWAP_FLAG_PRIO_MASK and SWAP_FLAG_PRIO_SHIFT
 *
 * Visible in this listing:
 *  - the caller must pass capable(CAP_SYS_ADMIN);
 *  - swap_info[] is scanned for a slot without SWP_USED; the index is
 *    checked against MAX_SWAPFILES and nr_swapfiles is raised to type+1
 *    if needed;
 *  - p->prio = --least_priority appears after the SWAP_FLAG_PREFER test
 *    (presumably its else branch - confirm);
 *  - for a block device: p->swap_device is taken from i_rdev, the block
 *    size is set to PAGE_SIZE, the device is opened with blkdev_open(),
 *    blk_size[][] is consulted, and the device number is compared with
 *    every existing swap_info[] slot;
 *  - for a regular file: the inode is compared with the inode of every
 *    existing swap_info[] slot;
 *  - one page is allocated for the header and slot 0 of the area is read
 *    into it with rw_swap_page_nocache(), with p->swap_lockmap pointed at
 *    a local word for the duration of the read;
 *  - the 10-byte magic selects header version 1 (SWAP-SPACE) or version 2
 *    (SWAPSPACE2);
 *  - version 1 path: the last 10 bytes of the header page are cleared,
 *    the page is scanned with test_bit(), and p->swap_map is allocated
 *    with vmalloc() and populated, with SWAP_MAP_BAD assigned in the
 *    per-bit loop;
 *  - version 2 path: info.version must be 1; highest_bit and max come
 *    from info.last_page; nr_badpages is checked against
 *    MAX_SWAP_BADPAGES and max against the largest offset SWP_ENTRY can
 *    carry; swap_map is allocated and zeroed; badpages[] entries are
 *    range-checked and marked SWAP_MAP_BAD; nr_good_pages is last_page
 *    minus the loop count and lock_map_size is (max + 7) / 8;
 *  - slot 0 is marked SWAP_MAP_BAD, the lock map is allocated and zeroed,
 *    flags becomes SWP_WRITEOK, and p->pages and nr_swap_pages are
 *    updated;
 *  - the area is linked into swap_list by comparing priorities;
 *  - cleanup lines release the opened file via f_op->release, vfree() the
 *    lock map, test SWAP_FLAG_PREFER again and free the header page.
 *
 * NOTE(review): source lines are missing from the listing (declarations
 * of filp, error, type, i, j and prev, the initial assignment of p, every
 * goto or label, the statements run when checks fail, the switch case
 * labels and the return value).  Which cleanup lines belong to the
 * failure path is not shown - confirm against the complete file.
 */
481 asmlinkage
int sys_swapon(const char * specialfile
, int swap_flags
)
483 struct swap_info_struct
* p
;
484 struct dentry
* swap_dentry
;
489 static int least_priority
= 0;
490 union swap_header
*swap_header
= 0;
491 int swap_header_version
;
492 int lock_map_size
= PAGE_SIZE
;
493 int nr_good_pages
= 0;
494 unsigned long tmp_lock_map
= 0;
497 if (!capable(CAP_SYS_ADMIN
))
499 memset(&filp
, 0, sizeof(filp
));
/* Find the first swap_info[] slot that is not marked SWP_USED. */
501 for (type
= 0 ; type
< nr_swapfiles
; type
++,p
++)
502 if (!(p
->flags
& SWP_USED
))
504 if (type
>= MAX_SWAPFILES
)
506 if (type
>= nr_swapfiles
)
507 nr_swapfiles
= type
+1;
512 p
->swap_lockmap
= NULL
;
518 if (swap_flags
& SWAP_FLAG_PREFER
) {
520 (swap_flags
& SWAP_FLAG_PRIO_MASK
)>>SWAP_FLAG_PRIO_SHIFT
;
522 p
->prio
= --least_priority
;
524 swap_dentry
= namei(specialfile
);
525 error
= PTR_ERR(swap_dentry
);
526 if (IS_ERR(swap_dentry
))
529 p
->swap_file
= swap_dentry
;
532 if (S_ISBLK(swap_dentry
->d_inode
->i_mode
)) {
533 p
->swap_device
= swap_dentry
->d_inode
->i_rdev
;
534 set_blocksize(p
->swap_device
, PAGE_SIZE
);
536 filp
.f_dentry
= swap_dentry
;
537 filp
.f_mode
= 3; /* read write */
538 error
= blkdev_open(swap_dentry
->d_inode
, &filp
);
541 set_blocksize(p
->swap_device
, PAGE_SIZE
);
543 if (!p
->swap_device
||
544 (blk_size
[MAJOR(p
->swap_device
)] &&
545 !blk_size
[MAJOR(p
->swap_device
)][MINOR(p
->swap_device
)]))
548 for (i
= 0 ; i
< nr_swapfiles
; i
++) {
551 if (p
->swap_device
== swap_info
[i
].swap_device
)
554 } else if (S_ISREG(swap_dentry
->d_inode
->i_mode
)) {
556 for (i
= 0 ; i
< nr_swapfiles
; i
++) {
559 if (swap_dentry
->d_inode
== swap_info
[i
].swap_file
->d_inode
)
565 swap_header
= (void *) __get_free_page(GFP_USER
);
567 printk("Unable to start swapping: out of memory :-)\n");
572 p
->swap_lockmap
= (char *) &tmp_lock_map
;
573 rw_swap_page_nocache(READ
, SWP_ENTRY(type
,0), (char *) swap_header
);
574 p
->swap_lockmap
= NULL
;
/* The 10-byte magic at swap_header->magic.magic selects the version. */
576 if (!memcmp("SWAP-SPACE",swap_header
->magic
.magic
,10))
577 swap_header_version
= 1;
578 else if (!memcmp("SWAPSPACE2",swap_header
->magic
.magic
,10))
579 swap_header_version
= 2;
581 printk("Unable to find swap-space signature\n");
586 switch (swap_header_version
) {
588 memset(((char *) swap_header
)+PAGE_SIZE
-10,0,10);
592 for (i
= 1 ; i
< 8*PAGE_SIZE
; i
++) {
593 if (test_bit(i
,(char *) swap_header
)) {
602 p
->swap_map
= vmalloc(p
->max
* sizeof(short));
607 for (i
= 1 ; i
< p
->max
; i
++) {
608 if (test_bit(i
,(char *) swap_header
))
611 p
->swap_map
[i
] = SWAP_MAP_BAD
;
616 /* Check the swap header's sub-version and the size of
617 the swap file and bad block lists */
618 if (swap_header
->info
.version
!= 1) {
620 "Unable to handle swap header version %d\n",
621 swap_header
->info
.version
);
627 p
->highest_bit
= swap_header
->info
.last_page
- 1;
628 p
->max
= swap_header
->info
.last_page
;
631 if (swap_header
->info
.nr_badpages
> MAX_SWAP_BADPAGES
)
633 if (p
->max
>= SWP_OFFSET(SWP_ENTRY(0,~0UL)))
636 /* OK, set up the swap map and apply the bad block list */
637 if (!(p
->swap_map
= vmalloc (p
->max
* sizeof(short)))) {
643 memset(p
->swap_map
, 0, p
->max
* sizeof(short));
644 for (i
=0; i
<swap_header
->info
.nr_badpages
; i
++) {
645 int page
= swap_header
->info
.badpages
[i
];
646 if (page
<= 0 || page
>= swap_header
->info
.last_page
)
649 p
->swap_map
[page
] = SWAP_MAP_BAD
;
651 nr_good_pages
= swap_header
->info
.last_page
- i
;
652 lock_map_size
= (p
->max
+ 7) / 8;
657 if (!nr_good_pages
) {
658 printk(KERN_WARNING
"Empty swap-file\n");
662 p
->swap_map
[0] = SWAP_MAP_BAD
;
663 if (!(p
->swap_lockmap
= vmalloc (lock_map_size
))) {
667 memset(p
->swap_lockmap
,0,lock_map_size
);
668 p
->flags
= SWP_WRITEOK
;
669 p
->pages
= nr_good_pages
;
670 nr_swap_pages
+= nr_good_pages
;
671 printk(KERN_INFO
"Adding Swap: %dk swap-space (priority %d)\n",
672 nr_good_pages
<<(PAGE_SHIFT
-10), p
->prio
);
674 /* insert swap space into swap_list: */
676 for (i
= swap_list
.head
; i
>= 0; i
= swap_info
[i
].next
) {
677 if (p
->prio
>= swap_info
[i
].prio
) {
684 swap_list
.head
= swap_list
.next
= p
- swap_info
;
686 swap_info
[prev
].next
= p
- swap_info
;
691 if(filp
.f_op
&& filp
.f_op
->release
)
692 filp
.f_op
->release(filp
.f_dentry
->d_inode
,&filp
);
695 vfree(p
->swap_lockmap
);
702 p
->swap_lockmap
= NULL
;
704 if (!(swap_flags
& SWAP_FLAG_PREFER
))
708 free_page((long) swap_header
);
/*
 * si_swapinfo - fill in the swap totals of a struct sysinfo.
 *
 * val: output; val->freeswap and val->totalswap are zeroed first and are
 *      shifted left by PAGE_SHIFT at the end
 *
 * Each swap_info[] slot below nr_swapfiles is tested for having all
 * SWP_WRITEOK bits set, and every swap_map entry of a slot that is
 * scanned is dispatched through a switch.
 *
 * NOTE(review): source lines are missing from the listing (declarations
 * of i and j, the statement run when the SWP_WRITEOK test fails, and the
 * switch cases that do the counting), and the function may continue past
 * the end of this chunk - confirm against the complete file.
 */
713 void si_swapinfo(struct sysinfo
*val
)
717 val
->freeswap
= val
->totalswap
= 0;
718 for (i
= 0; i
< nr_swapfiles
; i
++) {
719 if ((swap_info
[i
].flags
& SWP_WRITEOK
) != SWP_WRITEOK
)
721 for (j
= 0; j
< swap_info
[i
].max
; ++j
)
722 switch (swap_info
[i
].swap_map
[j
]) {
/* Scale both totals by PAGE_SHIFT. */
731 val
->freeswap
<<= PAGE_SHIFT
;
732 val
->totalswap
<<= PAGE_SHIFT
;