/*
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 * Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>

#include <asm/pgtable.h>
#include <linux/swapops.h>
spinlock_t swaplock = SPIN_LOCK_UNLOCKED;
unsigned int nr_swapfiles;
static int swap_overflow;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

struct swap_list_t swap_list = {-1, -1};

struct swap_info_struct swap_info[MAX_SWAPFILES];

#define SWAPFILE_CLUSTER 256
static inline int scan_swap_map(struct swap_info_struct *si)
{
	unsigned long offset;
	/*
	 * We try to cluster swap pages by allocating them
	 * sequentially in swap.  Once we've allocated
	 * SWAPFILE_CLUSTER pages this way, however, we resort to
	 * first-free allocation, starting a new cluster.  This
	 * prevents us from scattering swap pages all over the entire
	 * swap partition, so that we reduce overall disk seek times
	 * between swap pages.  -- sct
	 */
	if (si->cluster_nr) {
		while (si->cluster_next <= si->highest_bit) {
			offset = si->cluster_next++;
			if (si->swap_map[offset])
				continue;
			si->cluster_nr--;
			goto got_page;
		}
	}
	si->cluster_nr = SWAPFILE_CLUSTER;

	/* try to find an empty (even not aligned) cluster. */
	offset = si->lowest_bit;
check_next_cluster:
	if (offset + SWAPFILE_CLUSTER - 1 <= si->highest_bit) {
		int nr;
		for (nr = offset; nr < offset + SWAPFILE_CLUSTER; nr++)
			if (si->swap_map[nr]) {
				offset = nr + 1;
				goto check_next_cluster;
			}
		/* We found a completely empty cluster, so start
		 * using it.
		 */
		goto got_page;
	}
	/* No luck, so now go fine-grained as usual. -Andrea */
	for (offset = si->lowest_bit; offset <= si->highest_bit; offset++) {
		if (si->swap_map[offset])
			continue;
		si->lowest_bit = offset + 1;
got_page:
		if (offset == si->lowest_bit)
			si->lowest_bit++;
		if (offset == si->highest_bit)
			si->highest_bit--;
		if (si->lowest_bit > si->highest_bit) {
			si->lowest_bit = si->max;
			si->highest_bit = 0;
		}
		si->swap_map[offset] = 1;
		nr_swap_pages--;
		si->cluster_next = offset + 1;
		return offset;
	}
	si->lowest_bit = si->max;
	si->highest_bit = 0;
	return 0;
}
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct * p;
	unsigned long offset;
	swp_entry_t entry;
	int type, wrapped = 0;

	entry.val = 0;	/* Out of memory */
	swap_list_lock();
	type = swap_list.next;
	if (type < 0)
		goto out;
	if (nr_swap_pages <= 0)
		goto out;

	while (1) {
		p = &swap_info[type];
		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
			swap_device_lock(p);
			offset = scan_swap_map(p);
			swap_device_unlock(p);
			if (offset) {
				entry = swp_entry(type, offset);
				type = swap_info[type].next;
				if (type < 0 ||
					p->prio != swap_info[type].prio) {
					swap_list.next = swap_list.head;
				} else {
					swap_list.next = type;
				}
				goto out;
			}
		}
		type = p->next;
		if (!wrapped) {
			if (type < 0 || p->prio != swap_info[type].prio) {
				type = swap_list.head;
				wrapped = 1;
			}
		} else if (type < 0)
			goto out;	/* out of swap space */
	}
out:
	swap_list_unlock();
	return entry;
}
static struct swap_info_struct * swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = & swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (!p->swap_map[offset])
		goto bad_free;
	swap_list_lock();
	if (p->prio > swap_info[swap_list.next].prio)
		swap_list.next = type;
	swap_device_lock(p);
	return p;

bad_free:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
	goto out;
bad_offset:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}
static void swap_info_put(struct swap_info_struct * p)
{
	swap_device_unlock(p);
	swap_list_unlock();
}
static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
{
	int count = p->swap_map[offset];

	if (count < SWAP_MAP_MAX) {
		count--;
		p->swap_map[offset] = count;
		if (!count) {
			if (offset < p->lowest_bit)
				p->lowest_bit = offset;
			if (offset > p->highest_bit)
				p->highest_bit = offset;
			nr_swap_pages++;
		}
	}
	return count;
}
/*
 * Caller has made sure that the swapdevice corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct * p;

	p = swap_info_get(entry);
	if (p) {
		swap_entry_free(p, swp_offset(entry));
		swap_info_put(p);
	}
}
/*
 * Check if we're the only user of a swap page,
 * when the page is locked.
 */
static int exclusive_swap_page(struct page *page)
{
	int retval = 0;
	struct swap_info_struct * p;
	swp_entry_t entry;

	entry.val = page->index;
	p = swap_info_get(entry);
	if (p) {
		/* Is the only swap cache user the cache itself? */
		if (p->swap_map[swp_offset(entry)] == 1) {
			/* Recheck the page count with the pagecache lock held.. */
			read_lock(&swapper_space.page_lock);
			if (page_count(page) - !!PagePrivate(page) == 2)
				retval = 1;
			read_unlock(&swapper_space.page_lock);
		}
		swap_info_put(p);
	}
	return retval;
}
/*
 * We can use this swap cache entry directly
 * if there are no other references to it.
 *
 * Here "exclusive_swap_page()" does the real
 * work, but we opportunistically check whether
 * we need to get all the locks first..
 */
int can_share_swap_page(struct page *page)
{
	int retval = 0;

	if (!PageLocked(page))
		BUG();
	switch (page_count(page)) {
	case 3:
		if (!PagePrivate(page))
			break;
		/* Fallthrough */
	case 2:
		if (!PageSwapCache(page))
			break;
		retval = exclusive_swap_page(page);
		break;
	case 1:
		if (PageReserved(page))
			break;
		retval = 1;
	}
	return retval;
}
/*
 * Work out if there are any other processes sharing this
 * swap cache page. Free it if you can. Return success.
 */
int remove_exclusive_swap_page(struct page *page)
{
	int retval;
	struct swap_info_struct * p;
	swp_entry_t entry;

	BUG_ON(page_has_buffers(page));
	BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_count(page) != 2) /* 2: us + cache */
		return 0;

	entry.val = page->index;
	p = swap_info_get(entry);
	if (!p)
		return 0;

	/* Is the only swap cache user the cache itself? */
	retval = 0;
	if (p->swap_map[swp_offset(entry)] == 1) {
		/* Recheck the page count with the pagecache lock held.. */
		write_lock(&swapper_space.page_lock);
		if ((page_count(page) == 2) && !PageWriteback(page)) {
			__delete_from_swap_cache(page);
			SetPageDirty(page);
			retval = 1;
		}
		write_unlock(&swapper_space.page_lock);
	}
	swap_info_put(p);

	if (retval) {
		swap_free(entry);
		page_cache_release(page);
	}
	return retval;
}
/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
void free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct * p;
	struct page *page = NULL;

	p = swap_info_get(entry);
	if (p) {
		if (swap_entry_free(p, swp_offset(entry)) == 1)
			page = find_trylock_page(&swapper_space, entry.val);
		swap_info_put(p);
	}
	if (page) {
		int one_user;

		BUG_ON(page_has_buffers(page));
		page_cache_get(page);
		one_user = (page_count(page) == 2);
		/* Only cache user (+us), or swap space full? Free it! */
		if (!PageWriteback(page) && (one_user || vm_swap_full())) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		page_cache_release(page);
	}
}
/*
 * The swap entry has been read in advance, and we return 1 to indicate
 * that the page has been used or is no longer needed.
 *
 * Always set the resulting pte to be nowrite (the same as COW pages
 * after one process has exited).  We don't know just how many PTEs will
 * share this swap entry, so be cautious and let do_wp_page work out
 * what to do if a write is requested later.
 */
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
	pte_t *dir, swp_entry_t entry, struct page* page)
{
	pte_t pte = *dir;

	if (likely(pte_to_swp_entry(pte).val != entry.val))
		return;
	if (unlikely(pte_none(pte) || pte_present(pte)))
		return;
	get_page(page);
	set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
	page_add_rmap(page, dir);
	swap_free(entry);
	++vma->vm_mm->rss;
}
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	swp_entry_t entry, struct page* page)
{
	pte_t * pte;
	unsigned long end;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return;
	}
	pte = pte_offset_map(dir, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	pte_unmap(pte - 1);
}
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	swp_entry_t entry, struct page* page)
{
	pmd_t * pmd;
	unsigned long offset, end;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	if (address >= end)
		BUG();
	do {
		unuse_pmd(vma, pmd, address, end - address, offset, entry,
			  page);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
}
/* mmlist_lock and vma->vm_mm->page_table_lock are held */
static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
			swp_entry_t entry, struct page* page)
{
	unsigned long start = vma->vm_start, end = vma->vm_end;

	if (start >= end)
		BUG();
	do {
		unuse_pgd(vma, pgdir, start, end - start, entry, page);
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (start && (start < end));
}
static void unuse_process(struct mm_struct * mm,
			swp_entry_t entry, struct page* page)
{
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	spin_lock(&mm->page_table_lock);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
		unuse_vma(vma, pgd, entry, page);
	}
	spin_unlock(&mm->page_table_lock);
}
/*
 * Scan swap_map from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
static int find_next_to_unuse(struct swap_info_struct *si, int prev)
{
	int max = si->max;
	int i = prev;
	int count;

	/*
	 * No need for swap_device_lock(si) here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_list_lock()).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = si->swap_map[i];
		if (count && count != SWAP_MAP_BAD)
			break;
	}
	return i;
}
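/*
 * Illustrative sketch (#if 0, not part of the build): the wrap-around
 * scan used by find_next_to_unuse() above, reduced to a plain array in
 * userspace.  DEMO_BAD is a hypothetical stand-in for SWAP_MAP_BAD.
 */
#if 0
#define DEMO_BAD 0x8000

/* Scan map[] circularly from prev+1 for the next in-use entry;
 * return its index, or 0 when nothing (past the header) is in use. */
static int demo_next_in_use(const unsigned short *map, int max, int prev)
{
	int i = prev;

	for (;;) {
		if (++i >= max) {
			if (!prev)
				return 0;	/* scanned everything: empty */
			/* recheck the part before prev, skipping entry 0 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		if (map[i] && map[i] != DEMO_BAD)
			return i;
	}
}
#endif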
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering (now preserved by swap_out()),
	 * which clusters forked address spaces together, most recent
	 * child immediately after parent.  If we race with dup_mmap(),
	 * we very much want to resolve parent before child, otherwise
	 * we may miss some entries: using last mm would invert that.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * mmput() removes mm from mmlist before exit_mmap() and its
	 * zap_page_range().  That's not too bad, those entries are
	 * on their way out, and handled faster there than here.
	 * do_munmap() behaves similarly, taking the range out of mm's
	 * vma list before zap_page_range().  But unfortunately, when
	 * unmapping a part of a vma, it takes the whole out first,
	 * then reinserts what's left after (might even reschedule if
	 * open() method called) - so swap entries may be invisible
	 * to swapoff for a while, then reappear - but that is rare.
	 */
	while ((i = find_next_to_unuse(si, i))) {
		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry, without blocking.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swcount > 1) {
			flush_page_to_ram(page);
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				unuse_process(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *mm;

			spin_lock(&mmlist_lock);
			while (*swap_map > 1 &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				swcount = *swap_map;
				if (mm == &init_mm) {
					set_start_mm = 1;
					spin_unlock(&mmlist_lock);
					shmem = shmem_unuse(entry, page);
					spin_lock(&mmlist_lock);
				} else
					unuse_process(mm, entry, page);
				if (set_start_mm && *swap_map < swcount) {
					new_start_mm = mm;
					set_start_mm = 0;
				}
			}
			atomic_inc(&new_start_mm->mm_users);
			spin_unlock(&mmlist_lock);
			mmput(start_mm);
			start_mm = new_start_mm;
		}

		/*
		 * How could swap count reach 0x7fff when the maximum
		 * pid is 0x7fff, and there's no way to repeat a swap
		 * page within an mm (except in shmem, where it's the
		 * shared object which takes the reference count)?
		 * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
		 *
		 * If that's wrong, then we should worry more about
		 * exit_mmap() and do_munmap() cases described above:
		 * we might be resetting SWAP_MAP_MAX too early here.
		 * We know "Undead"s can happen, they're okay, so don't
		 * report them; but do report if we reset SWAP_MAP_MAX.
		 */
		if (*swap_map == SWAP_MAP_MAX) {
			swap_list_lock();
			swap_device_lock(si);
			nr_swap_pages++;
			*swap_map = 1;
			swap_device_unlock(si);
			swap_list_unlock();
			reset_overflow = 1;
		}

		/*
		 * If a reference remains (rare), we would like to leave
		 * the page in the swap cache; but try_to_swap_out could
		 * then re-duplicate the entry once we drop page lock,
		 * so we might loop indefinitely; also, that page could
		 * not be swapped out to other storage meanwhile.  So:
		 * delete from cache even if there's another reference,
		 * after ensuring that the data has been saved to disk -
		 * since if the reference remains (rarer), it will be
		 * read from disk into another page.  Splitting into two
		 * pages would be incorrect if swap supported "shared
		 * private" pages, but they are handled by tmpfs files.
		 *
		 * Note shmem_unuse already deleted a swappage from
		 * the swap cache, unless the move to filepage failed:
		 * in which case it left swappage in cache, lowered its
		 * swap count to pass quickly through the loops above,
		 * and now we must reincrement count to try again later.
		 */
		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
			swap_writepage(page);
			lock_page(page);
			wait_on_page_writeback(page);
		}
		if (PageSwapCache(page)) {
			if (shmem)
				swap_duplicate(entry);
			else
				delete_from_swap_cache(page);
		}

		/*
		 * So we could skip searching mms once swap count went
		 * to 1, we did not mark any present ptes as dirty: must
		 * mark page dirty so try_to_swap_out will preserve it.
		 */
		SetPageDirty(page);
		unlock_page(page);
		page_cache_release(page);

		/*
		 * Make sure that we aren't completely killing
		 * interactive performance.  Interruptible check on
		 * signal_pending() would be nice, but changes the spec?
		 */
		cond_resched();
	}

	mmput(start_mm);
	if (reset_overflow) {
		printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
		swap_overflow = 0;
	}
	return retval;
}
/*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 * corresponds to page offset `offset'.
 */
sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
{
	struct swap_extent *se = sis->curr_swap_extent;
	struct swap_extent *start_se = se;

	for ( ; ; ) {
		struct list_head *lh;

		if (se->start_page <= offset &&
				offset < (se->start_page + se->nr_pages)) {
			return se->start_block + (offset - se->start_page);
		}
		lh = se->list.next;
		if (lh == &sis->extent_list)
			lh = lh->next;	/* skip the list head */
		se = list_entry(lh, struct swap_extent, list);
		sis->curr_swap_extent = se;
		BUG_ON(se == start_se);		/* It *must* be present */
	}
}
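/*
 * Illustrative sketch (#if 0, not part of the build): the page->block
 * mapping that map_swap_page() above performs, over a flat array of
 * extents with a cached search position.  The types and names are
 * hypothetical userspace stand-ins for the kernel's swap_extent list.
 */
#if 0
struct demo_extent {
	unsigned long start_page;
	unsigned long nr_pages;
	unsigned long long start_block;
};

static unsigned demo_cached;	/* index of the last extent that matched */

static unsigned long long
demo_map_page(const struct demo_extent *ext, unsigned n, unsigned long page)
{
	unsigned i = demo_cached;
	unsigned tried;

	/* Start at the cached extent; most lookups hit it immediately. */
	for (tried = 0; tried < n; tried++, i = (i + 1) % n) {
		if (ext[i].start_page <= page &&
		    page < ext[i].start_page + ext[i].nr_pages) {
			demo_cached = i;
			return ext[i].start_block + (page - ext[i].start_page);
		}
	}
	return ~0ULL;	/* not mapped; the kernel BUG()s here instead */
}
#endif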
/*
 * Free all of a swapdev's extent information
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
	while (!list_empty(&sis->extent_list)) {
		struct swap_extent *se;

		se = list_entry(sis->extent_list.next,
				struct swap_extent, list);
		list_del(&se->list);
		kfree(se);
	}
	sis->nr_extents = 0;
}
/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent list.  The extent list is kept sorted in block order.
 *
 * This function rather assumes that it is called in ascending sector_t order.
 * It doesn't look for extent coalescing opportunities.
 */
static int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		unsigned long nr_pages, sector_t start_block)
{
	struct swap_extent *se;
	struct swap_extent *new_se;
	struct list_head *lh;

	lh = sis->extent_list.next;	/* The highest-addressed block */
	while (lh != &sis->extent_list) {
		se = list_entry(lh, struct swap_extent, list);
		if (se->start_block + se->nr_pages == start_block) {
			/* Merge it */
			se->nr_pages += nr_pages;
			return 0;
		}
		lh = lh->next;
	}

	/*
	 * No merge.  Insert a new extent, preserving ordering.
	 */
	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
	if (new_se == NULL)
		return -ENOMEM;
	new_se->start_page = start_page;
	new_se->nr_pages = nr_pages;
	new_se->start_block = start_block;

	lh = sis->extent_list.prev;	/* The lowest block */
	while (lh != &sis->extent_list) {
		se = list_entry(lh, struct swap_extent, list);
		if (se->start_block > start_block)
			break;
		lh = lh->prev;
	}
	list_add_tail(&new_se->list, lh);
	sis->nr_extents++;
	return 0;
}
/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis)
{
	struct inode *inode;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	int ret;

	inode = sis->swap_file->f_dentry->d_inode;
	if (S_ISBLK(inode->i_mode)) {
		ret = add_swap_extent(sis, 0, sis->max, 0);
		goto done;
	}

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent list.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = inode->i_size >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		first_block = bmap(inode, probe_block);
		if (first_block == 0)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = bmap(inode, probe_block + block_in_page);
			if (block == 0)
				goto bad_bmap;
			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1,
				first_block >> (PAGE_SHIFT - blkbits));
		if (ret)
			goto out;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = 0;
	if (page_no == 0)
		ret = -EINVAL;
	sis->max = page_no;
	sis->highest_bit = page_no - 1;
done:
	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
					struct swap_extent, list);
	ret = 0;
out:
	return ret;
bad_bmap:
	printk(KERN_ERR "swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}
#if 0	/* We don't need this yet */
#include <linux/backing-dev.h>
int page_queue_congested(struct page *page)
{
	struct backing_dev_info *bdi;

	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */

	bdi = page->mapping->backing_dev_info;
	if (PageSwapCache(page)) {
		swp_entry_t entry = { .val = page->index };
		struct swap_info_struct *sis;

		sis = get_swap_info_struct(swp_type(entry));
		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
	}
	return bdi_write_congested(bdi);
}
#endif
asmlinkage long sys_swapoff(const char * specialfile)
{
	struct swap_info_struct * p = NULL;
	unsigned short *swap_map;
	struct file *swap_file, *victim;
	struct address_space *mapping;
	int i, type, prev;
	int err;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	victim = filp_open(specialfile, O_RDWR, 0);
	err = PTR_ERR(victim);
	if (IS_ERR(victim))
		goto out;

	mapping = victim->f_dentry->d_inode->i_mapping;
	prev = -1;
	swap_list_lock();
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
			if (p->swap_file->f_dentry->d_inode->i_mapping == mapping)
				break;
		}
		prev = type;
	}
	err = -EINVAL;
	if (type < 0) {
		swap_list_unlock();
		goto out_dput;
	}
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
	nr_swap_pages -= p->pages;
	total_swap_pages -= p->pages;
	p->flags &= ~SWP_WRITEOK;
	swap_list_unlock();

	err = try_to_unuse(type);
	if (err) {
		/* re-insert swap space back into swap_list */
		swap_list_lock();
		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
			if (p->prio >= swap_info[i].prio)
				break;
		p->next = i;
		if (prev < 0)
			swap_list.head = swap_list.next = p - swap_info;
		else
			swap_info[prev].next = p - swap_info;
		nr_swap_pages += p->pages;
		total_swap_pages += p->pages;
		p->flags |= SWP_WRITEOK;
		swap_list_unlock();
		goto out_dput;
	}
	swap_list_lock();
	swap_device_lock(p);
	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;
	swap_map = p->swap_map;
	p->swap_map = NULL;
	p->flags = 0;
	destroy_swap_extents(p);
	swap_device_unlock(p);
	swap_list_unlock();
	vfree(swap_map);
	if (S_ISBLK(swap_file->f_dentry->d_inode->i_mode)) {
		struct block_device *bdev;
		bdev = swap_file->f_dentry->d_inode->i_bdev;
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	}
	filp_close(swap_file, NULL);
	err = 0;

out_dput:
	filp_close(victim, NULL);
out:
	return err;
}
#ifdef CONFIG_PROC_FS
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
	struct swap_info_struct *ptr = swap_info;
	int i;
	loff_t l = *pos;
	char * page = (char *) __get_free_page(GFP_KERNEL);

	swap->private = page;	/* save for swap_show */
	swap_list_lock();
	if (!page)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < nr_swapfiles; i++, ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		if (!l--)
			return ptr;
	}
	return NULL;
}

static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
	struct swap_info_struct *ptr = v;
	void *endptr = (void *) swap_info + nr_swapfiles * sizeof(struct swap_info_struct);

	for (++ptr; ptr < (struct swap_info_struct *) endptr; ptr++) {
		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
			continue;
		++*pos;
		return ptr;
	}
	return NULL;
}

static void swap_stop(struct seq_file *swap, void *v)
{
	swap_list_unlock();
	free_page((unsigned long) swap->private);
	swap->private = NULL;
}

static int swap_show(struct seq_file *swap, void *v)
{
	struct swap_info_struct *ptr = v;
	struct file *file;
	int j, usedswap;
	char *path;

	if (v == swap_info)
		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");

	file = ptr->swap_file;
	path = d_path(file->f_dentry, file->f_vfsmnt, swap->private, PAGE_SIZE);

	for (j = 0, usedswap = 0; j < ptr->max; ++j)
		switch (ptr->swap_map[j]) {
		case SWAP_MAP_BAD:
		case 0:
			continue;
		default:
			usedswap++;
		}
	seq_printf(swap, "%-39s %s\t%d\t%d\t%d\n",
		       path,
		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
				"partition" : "file\t",
		       ptr->pages << (PAGE_SHIFT - 10),
		       usedswap << (PAGE_SHIFT - 10),
		       ptr->prio);
	return 0;
}

static struct seq_operations swaps_op = {
	.start =	swap_start,
	.next =		swap_next,
	.stop =		swap_stop,
	.show =		swap_show
};

static int swaps_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &swaps_op);
}

static struct file_operations proc_swaps_operations = {
	.open		= swaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init procswaps_init(void)
{
	struct proc_dir_entry *entry;

	entry = create_proc_entry("swaps", 0, NULL);
	if (entry)
		entry->proc_fops = &proc_swaps_operations;
	return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */
/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
{
	struct swap_info_struct * p;
	char *name = NULL;
	struct block_device *bdev = NULL;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	unsigned int type;
	int i, prev;
	int error;
	static int least_priority = 0;
	union swap_header *swap_header = 0;
	int swap_header_version;
	int nr_good_pages = 0;
	unsigned long maxpages = 1;
	int swapfilesize;
	unsigned short *swap_map;
	struct page *page = NULL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	swap_list_lock();
	p = swap_info;
	for (type = 0 ; type < nr_swapfiles ; type++,p++)
		if (!(p->flags & SWP_USED))
			break;
	error = -EPERM;
	if (type >= MAX_SWAPFILES) {
		swap_list_unlock();
		goto out;
	}
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
	INIT_LIST_HEAD(&p->extent_list);
	p->flags = SWP_USED;
	p->nr_extents = 0;
	p->swap_file = NULL;
	p->old_block_size = 0;
	p->swap_map = NULL;
	p->lowest_bit = 0;
	p->highest_bit = 0;
	p->cluster_nr = 0;
	p->sdev_lock = SPIN_LOCK_UNLOCKED;
	p->next = -1;
	if (swap_flags & SWAP_FLAG_PREFER) {
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
	} else {
		p->prio = --least_priority;
	}
	swap_list_unlock();
	name = getname(specialfile);
	error = PTR_ERR(name);
	if (IS_ERR(name)) {
		name = NULL;
		goto bad_swap_2;
	}
	swap_file = filp_open(name, O_RDWR, 0);
	error = PTR_ERR(swap_file);
	if (IS_ERR(swap_file)) {
		swap_file = NULL;
		goto bad_swap_2;
	}

	p->swap_file = swap_file;

	error = -EINVAL;
	if (S_ISBLK(swap_file->f_dentry->d_inode->i_mode)) {
		bdev = swap_file->f_dentry->d_inode->i_bdev;
		error = bd_claim(bdev, sys_swapon);
		if (error < 0) {
			bdev = NULL;
			goto bad_swap;
		}
		p->old_block_size = block_size(bdev);
		error = set_blocksize(swap_file->f_dentry->d_inode->i_bdev,
				PAGE_SIZE);
		if (error < 0)
			goto bad_swap;
		p->bdev = bdev;
	} else if (S_ISREG(swap_file->f_dentry->d_inode->i_mode)) {
		p->bdev = swap_file->f_dentry->d_inode->i_sb->s_bdev;
	} else {
		goto bad_swap;
	}

	mapping = swap_file->f_dentry->d_inode->i_mapping;
	swapfilesize = mapping->host->i_size >> PAGE_SHIFT;

	error = -EBUSY;
	for (i = 0 ; i < nr_swapfiles ; i++) {
		struct swap_info_struct *q = &swap_info[i];
		if (i == type || !q->swap_file)
			continue;
		if (mapping == q->swap_file->f_dentry->d_inode->i_mapping)
			goto bad_swap;
	}

	/*
	 * Read the swap header.
	 */
	page = read_cache_page(mapping, 0,
			(filler_t *)mapping->a_ops->readpage, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap;
	}
	wait_on_page_locked(page);
	if (!PageUptodate(page))
		goto bad_swap;
	kmap(page);
	swap_header = page_address(page);

	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
		swap_header_version = 1;
	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
		swap_header_version = 2;
	else {
		printk("Unable to find swap-space signature\n");
		error = -EINVAL;
		goto bad_swap;
	}

	switch (swap_header_version) {
	case 1:
		printk(KERN_ERR "version 0 swap is no longer supported. "
			"Use mkswap -v1 %s\n", name);
		error = -EINVAL;
		goto bad_swap;
	case 2:
		/* Check the swap header's sub-version and the size of
		   the swap file and bad block lists */
		if (swap_header->info.version != 1) {
			printk(KERN_WARNING
			       "Unable to handle swap header version %d\n",
			       swap_header->info.version);
			error = -EINVAL;
			goto bad_swap;
		}

		p->lowest_bit  = 1;
		maxpages = swp_offset(swp_entry(0,~0UL)) - 1;
		if (maxpages > swap_header->info.last_page)
			maxpages = swap_header->info.last_page;
		p->highest_bit = maxpages - 1;

		error = -EINVAL;
		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
			goto bad_swap;

		/* OK, set up the swap map and apply the bad block list */
		if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
			error = -ENOMEM;
			goto bad_swap;
		}

		error = 0;
		memset(p->swap_map, 0, maxpages * sizeof(short));
		for (i=0; i<swap_header->info.nr_badpages; i++) {
			int page = swap_header->info.badpages[i];
			if (page <= 0 || page >= swap_header->info.last_page)
				error = -EINVAL;
			else
				p->swap_map[page] = SWAP_MAP_BAD;
		}
		nr_good_pages = swap_header->info.last_page -
				swap_header->info.nr_badpages -
				1 /* header page */;
		if (error)
			goto bad_swap;
	}

	if (swapfilesize && maxpages > swapfilesize) {
		printk(KERN_WARNING
		       "Swap area shorter than signature indicates\n");
		error = -EINVAL;
		goto bad_swap;
	}
	if (!nr_good_pages) {
		printk(KERN_WARNING "Empty swap-file\n");
		error = -EINVAL;
		goto bad_swap;
	}
	p->swap_map[0] = SWAP_MAP_BAD;
	p->max = maxpages;
	p->pages = nr_good_pages;

	if (setup_swap_extents(p))
		goto bad_swap;

	swap_list_lock();
	swap_device_lock(p);
	p->flags = SWP_ACTIVE;
	nr_swap_pages += nr_good_pages;
	total_swap_pages += nr_good_pages;
	printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
		nr_good_pages<<(PAGE_SHIFT-10), name,
		p->prio, p->nr_extents);

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio)
			break;
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
	swap_device_unlock(p);
	swap_list_unlock();
	error = 0;
	goto out;
bad_swap:
	if (bdev) {
		set_blocksize(bdev, p->old_block_size);
		bd_release(bdev);
	}
bad_swap_2:
	swap_list_lock();
	swap_map = p->swap_map;
	p->swap_file = NULL;
	p->swap_map = NULL;
	p->flags = 0;
	if (!(swap_flags & SWAP_FLAG_PREFER))
		++least_priority;
	swap_list_unlock();
	destroy_swap_extents(p);
	if (swap_map)
		vfree(swap_map);
	if (swap_file && !IS_ERR(swap_file))
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		page_cache_release(page);
	}
	if (name)
		putname(name);
	return error;
}
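/*
 * Usage sketch (#if 0, not part of the build): exercising the two
 * syscalls above from userspace through the glibc wrappers, including
 * the preferred-priority flag parsed by sys_swapon().  "/dev/hda2" is
 * a hypothetical device name.
 */
#if 0
#include <stdio.h>
#include <sys/swap.h>

int main(void)
{
	/* Enable with priority 5 via SWAP_FLAG_PREFER. */
	if (swapon("/dev/hda2",
		   SWAP_FLAG_PREFER |
		   ((5 << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK)) < 0)
		perror("swapon");

	/* ... later, tear it down again. */
	if (swapoff("/dev/hda2") < 0)
		perror("swapoff");
	return 0;
}
#endif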
void si_swapinfo(struct sysinfo *val)
{
	unsigned int i;
	unsigned long nr_to_be_unused = 0;

	swap_list_lock();
	for (i = 0; i < nr_swapfiles; i++) {
		unsigned int j;
		if (!(swap_info[i].flags & SWP_USED) ||
		     (swap_info[i].flags & SWP_WRITEOK))
			continue;
		for (j = 0; j < swap_info[i].max; ++j) {
			switch (swap_info[i].swap_map[j]) {
			case 0:
			case SWAP_MAP_BAD:
				continue;
			default:
				nr_to_be_unused++;
			}
		}
	}
	val->freeswap = nr_swap_pages + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	swap_list_unlock();
}
/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
 * "permanent", but will be reclaimed by the next swapoff.
 */
int swap_duplicate(swp_entry_t entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;
	int result = 0;

	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_file;
	p = type + swap_info;
	offset = swp_offset(entry);

	swap_device_lock(p);
	if (offset < p->max && p->swap_map[offset]) {
		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
			p->swap_map[offset]++;
			result = 1;
		} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
			if (swap_overflow++ < 5)
				printk(KERN_WARNING "swap_dup: swap entry overflow\n");
			p->swap_map[offset] = SWAP_MAP_MAX;
			result = 1;
		}
	}
	swap_device_unlock(p);
out:
	return result;

bad_file:
	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
	goto out;
}
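/*
 * Illustrative sketch (#if 0, not part of the build): the saturating
 * reference count that swap_duplicate()/swap_entry_free() maintain per
 * slot.  Once the count pins at DEMO_MAX it is never decremented; only
 * swapoff reclaims such "permanent" entries.  DEMO_MAX is a stand-in
 * for SWAP_MAP_MAX.
 */
#if 0
#define DEMO_MAX 0x7fff

static int demo_dup(unsigned short *count)	/* cf. swap_duplicate() */
{
	if (!*count)
		return 0;		/* free slot: invalid to duplicate */
	if (*count < DEMO_MAX - 1)
		(*count)++;
	else
		*count = DEMO_MAX;	/* saturate: now "permanent" */
	return 1;
}

static int demo_free(unsigned short *count)	/* cf. swap_entry_free() */
{
	if (*count < DEMO_MAX)
		(*count)--;		/* saturated counts are never freed */
	return *count;
}
#endif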
struct swap_info_struct *
get_swap_info_struct(unsigned type)
{
	return &swap_info[type];
}
/*
 * swap_device_lock prevents swap_map being freed. Don't grab an extra
 * reference on the swaphandle, it doesn't matter if it becomes unused.
 */
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
	int ret = 0, i = 1 << page_cluster;
	unsigned long toff;
	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;

	if (!page_cluster)	/* no readahead */
		return 0;
	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
	if (!toff)		/* first page is swap header */
		toff++, i--;
	*offset = toff;

	swap_device_lock(swapdev);
	do {
		/* Don't read-ahead past the end of the swap area */
		if (toff >= swapdev->max)
			break;
		/* Don't read in free or bad pages */
		if (!swapdev->swap_map[toff])
			break;
		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
			break;
		toff++;
		ret++;
	} while (--i);
	swap_device_unlock(swapdev);
	return ret;
}
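/*
 * Illustrative sketch (#if 0, not part of the build): the aligned
 * readahead window computed by valid_swaphandles() above.  For a
 * cluster shift of 3 (window of 8 slots), any offset is rounded down
 * to a multiple of 8, and slot 0 (the swap header) is skipped.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 3;	/* stand-in for the kernel tunable */
	unsigned long offsets[] = { 1, 7, 8, 21 };
	unsigned i;

	for (i = 0; i < 4; i++) {
		unsigned long toff =
			(offsets[i] >> page_cluster) << page_cluster;
		unsigned long n = 1UL << page_cluster;

		if (!toff) {		/* never read the header page */
			toff++;
			n--;
		}
		/* e.g. offset 21 -> window [16, 24) */
		printf("offset %lu -> window [%lu, %lu)\n",
		       offsets[i], toff, toff + n);
	}
	return 0;
}
#endif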