mm/swapfile.c

   1 /*
   2  *  linux/mm/swapfile.c
   3  *
   4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   5  *  Swap reorganised 29.12.95, Stephen Tweedie
   6  */
   7
   8 #include <linux/config.h>
   9 #include <linux/mm.h>
  10 #include <linux/slab.h>
  11 #include <linux/kernel_stat.h>
  12 #include <linux/swap.h>
  13 #include <linux/vmalloc.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/namei.h>
  16 #include <linux/shm.h>
  17 #include <linux/blkdev.h>
  18 #include <linux/buffer_head.h>
  19 #include <linux/proc_fs.h>
  20 #include <linux/seq_file.h>
  21 #include <linux/init.h>
  22
  23 #include <asm/pgtable.h>
  24 #include <linux/swapops.h>
  25
  26 spinlock_t swaplock = SPIN_LOCK_UNLOCKED;   /* NOTE(review): presumably the lock behind swap_list_lock() -- confirm in <linux/swap.h> */
     /* swap_info_get() rejects any entry whose swap type is >= nr_swapfiles. */
  27 unsigned int nr_swapfiles;
  28 int total_swap_pages;
     /* Cleared by try_to_unuse() after it has reset a SWAP_MAP_MAX count. */
  29 static int swap_overflow;
  30
     /* Message fragments printed by swap_info_get() when an entry fails validation. */
  31 static const char Bad_file[] = "Bad swap file entry ";
  32 static const char Unused_file[] = "Unused swap file entry ";
  33 static const char Bad_offset[] = "Bad swap offset entry ";
  34 static const char Unused_offset[] = "Unused swap offset entry ";
  35
     /* get_swap_page() treats a negative swap_list.next as "no swap area to allocate from". */
  36 struct swap_list_t swap_list = {-1, -1};
  37
     /* One slot per swap area, indexed by swap type (see swp_type() use in swap_info_get()). */
  38 struct swap_info_struct swap_info[MAX_SWAPFILES];
  39
     /* Number of slots scan_swap_map() hands out sequentially before it looks for a new cluster. */
  40 #define SWAPFILE_CLUSTER 256
  41
  42 static inline int scan_swap_map(struct swap_info_struct *si)
  43 {
  44         unsigned long offset;
  45         /*
  46          * We try to cluster swap pages by allocating them
  47          * sequentially in swap.  Once we've allocated
  48          * SWAPFILE_CLUSTER pages this way, however, we resort to
  49          * first-free allocation, starting a new cluster.  This
  50          * prevents us from scattering swap pages all over the entire
  51          * swap partition, so that we reduce overall disk seek times
  52          * between swap pages.  -- sct */
  53         if (si->cluster_nr) {
  54                 while (si->cluster_next <= si->highest_bit) {
  55                         offset = si->cluster_next++;
  56                         if (si->swap_map[offset])
  57                                 continue;
  58                         si->cluster_nr--;
  59                         goto got_page;
  60                 }
  61         }
  62         si->cluster_nr = SWAPFILE_CLUSTER;
  63
  64         /* try to find an empty (even not aligned) cluster. */
  65         offset = si->lowest_bit;
  66  check_next_cluster:
  67         if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
  68         {
  69                 int nr;
  70                 for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
  71                         if (si->swap_map[nr])
  72                         {
  73                                 offset = nr+1;
  74                                 goto check_next_cluster;
  75                         }
  76                 /* We found a completly empty cluster, so start
  77                  * using it.
  78                  */
  79                 goto got_page;
  80         }
  81         /* No luck, so now go finegrined as usual. -Andrea */
  82         for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
  83                 if (si->swap_map[offset])
  84                         continue;
  85                 si->lowest_bit = offset+1;
  86         got_page:
  87                 if (offset == si->lowest_bit)
  88                         si->lowest_bit++;
  89                 if (offset == si->highest_bit)
  90                         si->highest_bit--;
  91                 if (si->lowest_bit > si->highest_bit) {
  92                         si->lowest_bit = si->max;
  93                         si->highest_bit = 0;
  94                 }
  95                 si->swap_map[offset] = 1;
  96                 nr_swap_pages--;
  97                 si->cluster_next = offset+1;
  98                 return offset;
  99         }
 100         si->lowest_bit = si->max;
 101         si->highest_bit = 0;
 102         return 0;
 103 }
 104
 105 swp_entry_t get_swap_page(void)
 106 {
 107         struct swap_info_struct * p;
 108         unsigned long offset;
 109         swp_entry_t entry;
 110         int type, wrapped = 0;
 111
 112         entry.val = 0;  /* Out of memory */
 113         swap_list_lock();
 114         type = swap_list.next;
 115         if (type < 0)
 116                 goto out;
 117         if (nr_swap_pages <= 0)
 118                 goto out;
 119
 120         while (1) {
 121                 p = &swap_info[type];
 122                 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
 123                         swap_device_lock(p);
 124                         offset = scan_swap_map(p);
 125                         swap_device_unlock(p);
 126                         if (offset) {
 127                                 entry = swp_entry(type,offset);
 128                                 type = swap_info[type].next;
 129                                 if (type < 0 ||
 130                                         p->prio != swap_info[type].prio) {
 131                                                 swap_list.next = swap_list.head;
 132                                 } else {
 133                                         swap_list.next = type;
 134                                 }
 135                                 goto out;
 136                         }
 137                 }
 138                 type = p->next;
 139                 if (!wrapped) {
 140                         if (type < 0 || p->prio != swap_info[type].prio) {
 141                                 type = swap_list.head;
 142                                 wrapped = 1;
 143                         }
 144                 } else
 145                         if (type < 0)
 146                                 goto out;       /* out of swap space */
 147         }
 148 out:
 149         swap_list_unlock();
 150         return entry;
 151 }
 152
 153 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 154 {
 155         struct swap_info_struct * p;
 156         unsigned long offset, type;
 157
 158         if (!entry.val)
 159                 goto out;
 160         type = swp_type(entry);
 161         if (type >= nr_swapfiles)
 162                 goto bad_nofile;
 163         p = & swap_info[type];
 164         if (!(p->flags & SWP_USED))
 165                 goto bad_device;
 166         offset = swp_offset(entry);
 167         if (offset >= p->max)
 168                 goto bad_offset;
 169         if (!p->swap_map[offset])
 170                 goto bad_free;
 171         swap_list_lock();
 172         if (p->prio > swap_info[swap_list.next].prio)
 173                 swap_list.next = type;
 174         swap_device_lock(p);
 175         return p;
 176
 177 bad_free:
 178         printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
 179         goto out;
 180 bad_offset:
 181         printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
 182         goto out;
 183 bad_device:
 184         printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
 185         goto out;
 186 bad_nofile:
 187         printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
 188 out:
 189         return NULL;
 190 }
 191
 /*
  * Counterpart of swap_info_get(): release the device lock on p, then the
  * swap list lock, i.e. in the reverse of the order they were taken.
  */
 static void swap_info_put(struct swap_info_struct *p)
 {
         swap_device_unlock(p);
         swap_list_unlock();
 }
 197
 198 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 199 {
 200         int count = p->swap_map[offset];
 201
 202         if (count < SWAP_MAP_MAX) {
 203                 count--;
 204                 p->swap_map[offset] = count;
 205                 if (!count) {
 206                         if (offset < p->lowest_bit)
 207                                 p->lowest_bit = offset;
 208                         if (offset > p->highest_bit)
 209                                 p->highest_bit = offset;
 210                         nr_swap_pages++;
 211                 }
 212         }
 213         return count;
 214 }
 215
 216 /*
 217  * Caller has made sure that the swapdevice corresponding to entry
 218  * is still around or has not been recycled.
 219  */
 220 void swap_free(swp_entry_t entry)
 221 {
 222         struct swap_info_struct * p;
 223
 224         p = swap_info_get(entry);
 225         if (p) {
 226                 swap_entry_free(p, swp_offset(entry));
 227                 swap_info_put(p);
 228         }
 229 }
 230
 231 /*
 232  * Check if we're the only user of a swap page,
 233  * when the page is locked.
 234  */
 235 static int exclusive_swap_page(struct page *page)
 236 {
 237         int retval = 0;
 238         struct swap_info_struct * p;
 239         swp_entry_t entry;
 240
 241         entry.val = page->index;
 242         p = swap_info_get(entry);
 243         if (p) {
 244                 /* Is the only swap cache user the cache itself? */
 245                 if (p->swap_map[swp_offset(entry)] == 1) {
 246                         /* Recheck the page count with the pagecache lock held.. */
 247                         read_lock(&swapper_space.page_lock);
 248                         if (page_count(page) - !!PagePrivate(page) == 2)
 249                                 retval = 1;
 250                         read_unlock(&swapper_space.page_lock);
 251                 }
 252                 swap_info_put(p);
 253         }
 254         return retval;
 255 }
 256
 /*
  * Decide whether a locked page can be used directly because nobody else
  * references it.  BUGs if the page is not locked.
  *
  * The page count is sampled once and used as a cheap filter before
  * exclusive_swap_page() takes the locks needed for the real check:
  *   count 1      - ours alone, unless the page is reserved;
  *   count 2      - swap cache page: ask exclusive_swap_page();
  *   count 3      - as for 2, but only if the page is PagePrivate;
  *   anything else - shared.
  *
  * Returns 1 when the page may be used directly, 0 otherwise.
  */
 int can_share_swap_page(struct page *page)
 {
         int users;

         if (!PageLocked(page))
                 BUG();

         users = page_count(page);
         if (users == 1)
                 return !PageReserved(page);
         if (users != 2 && users != 3)
                 return 0;
         if (users == 3 && !PagePrivate(page))
                 return 0;
         if (!PageSwapCache(page))
                 return 0;
         return exclusive_swap_page(page);
 }
 288
 289 /*
 290  * Work out if there are any other processes sharing this
 291  * swap cache page. Free it if you can. Return success.
 292  */
 293 int remove_exclusive_swap_page(struct page *page)
 294 {
 295         int retval;
 296         struct swap_info_struct * p;
 297         swp_entry_t entry;
 298
 299         BUG_ON(page_has_buffers(page));
 300         BUG_ON(!PageLocked(page));
 301
 302         if (!PageSwapCache(page))
 303                 return 0;
 304         if (PageWriteback(page))
 305                 return 0;
 306         if (page_count(page) != 2) /* 2: us + cache */
 307                 return 0;
 308
 309         entry.val = page->index;
 310         p = swap_info_get(entry);
 311         if (!p)
 312                 return 0;
 313
 314         /* Is the only swap cache user the cache itself? */
 315         retval = 0;
 316         if (p->swap_map[swp_offset(entry)] == 1) {
 317                 /* Recheck the page count with the pagecache lock held.. */
 318                 write_lock(&swapper_space.page_lock);
 319                 if ((page_count(page) == 2) && !PageWriteback(page)) {
 320                         __delete_from_swap_cache(page);
 321                         SetPageDirty(page);
 322                         retval = 1;
 323                 }
 324                 write_unlock(&swapper_space.page_lock);
 325         }
 326         swap_info_put(p);
 327
 328         if (retval) {
 329                 swap_free(entry);
 330                 page_cache_release(page);
 331         }
 332
 333         return retval;
 334 }
 335
 336 /*
 337  * Free the swap entry like above, but also try to
 338  * free the page cache entry if it is the last user.
 339  */
 340 void free_swap_and_cache(swp_entry_t entry)
 341 {
 342         struct swap_info_struct * p;
 343         struct page *page = NULL;
 344
 345         p = swap_info_get(entry);
 346         if (p) {
 347                 if (swap_entry_free(p, swp_offset(entry)) == 1)
 348                         page = find_trylock_page(&swapper_space, entry.val);
 349                 swap_info_put(p);
 350         }
 351         if (page) {
 352                 int one_user;
 353
 354                 BUG_ON(page_has_buffers(page));
 355                 page_cache_get(page);
 356                 one_user = (page_count(page) == 2);
 357                 /* Only cache user (+us), or swap space full? Free it! */
 358                 if (!PageWriteback(page) && (one_user || vm_swap_full())) {
 359                         delete_from_swap_cache(page);
 360                         SetPageDirty(page);
 361                 }
 362                 unlock_page(page);
 363                 page_cache_release(page);
 364         }
 365 }
 366
 367 /*
 368  * The swap entry has been read in advance, and we return 1 to indicate
 369  * that the page has been used or is no longer needed.
 370  *
 371  * Always set the resulting pte to be nowrite (the same as COW pages
 372  * after one process has exited).  We don't know just how many PTEs will
 373  * share this swap entry, so be cautious and let do_wp_page work out
 374  * what to do if a write is requested later.
 375  */
 376 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 377 static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
 378         pte_t *dir, swp_entry_t entry, struct page* page)
 379 {
 380         pte_t pte = *dir;
 381
 382         if (likely(pte_to_swp_entry(pte).val != entry.val))
 383                 return;
 384         if (unlikely(pte_none(pte) || pte_present(pte)))
 385                 return;
 386         get_page(page);
 387         set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
 388         page_add_rmap(page, dir);
 389         swap_free(entry);
 390         ++vma->vm_mm->rss;
 391 }
 392
 393 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 394 static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
 395         unsigned long address, unsigned long size, unsigned long offset,
 396         swp_entry_t entry, struct page* page)
 397 {
 398         pte_t * pte;
 399         unsigned long end;
 400
 401         if (pmd_none(*dir))
 402                 return;
 403         if (pmd_bad(*dir)) {
 404                 pmd_ERROR(*dir);
 405                 pmd_clear(dir);
 406                 return;
 407         }
 408         pte = pte_offset_map(dir, address);
 409         offset += address & PMD_MASK;
 410         address &= ~PMD_MASK;
 411         end = address + size;
 412         if (end > PMD_SIZE)
 413                 end = PMD_SIZE;
 414         do {
 415                 unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
 416                 address += PAGE_SIZE;
 417                 pte++;
 418         } while (address && (address < end));
 419         pte_unmap(pte - 1);
 420 }
 421
 422 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 423 static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 424         unsigned long address, unsigned long size,
 425         swp_entry_t entry, struct page* page)
 426 {
 427         pmd_t * pmd;
 428         unsigned long offset, end;
 429
 430         if (pgd_none(*dir))
 431                 return;
 432         if (pgd_bad(*dir)) {
 433                 pgd_ERROR(*dir);
 434                 pgd_clear(dir);
 435                 return;
 436         }
 437         pmd = pmd_offset(dir, address);
 438         offset = address & PGDIR_MASK;
 439         address &= ~PGDIR_MASK;
 440         end = address + size;
 441         if (end > PGDIR_SIZE)
 442                 end = PGDIR_SIZE;
 443         if (address >= end)
 444                 BUG();
 445         do {
 446                 unuse_pmd(vma, pmd, address, end - address, offset, entry,
 447                           page);
 448                 address = (address + PMD_SIZE) & PMD_MASK;
 449                 pmd++;
 450         } while (address && (address < end));
 451 }
 452
 453 /* mmlist_lock and vma->vm_mm->page_table_lock are held */
 454 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
 455                         swp_entry_t entry, struct page* page)
 456 {
 457         unsigned long start = vma->vm_start, end = vma->vm_end;
 458
 459         if (start >= end)
 460                 BUG();
 461         do {
 462                 unuse_pgd(vma, pgdir, start, end - start, entry, page);
 463                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
 464                 pgdir++;
 465         } while (start && (start < end));
 466 }
 467
 468 static void unuse_process(struct mm_struct * mm,
 469                         swp_entry_t entry, struct page* page)
 470 {
 471         struct vm_area_struct* vma;
 472
 473         /*
 474          * Go through process' page directory.
 475          */
 476         spin_lock(&mm->page_table_lock);
 477         for (vma = mm->mmap; vma; vma = vma->vm_next) {
 478                 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
 479                 unuse_vma(vma, pgd, entry, page);
 480         }
 481         spin_unlock(&mm->page_table_lock);
 482         return;
 483 }
 484
 485 /*
 486  * Scan swap_map from current position to next entry still in use.
 487  * Recycle to start on reaching the end, returning 0 when empty.
 488  */
 489 static int find_next_to_unuse(struct swap_info_struct *si, int prev)
 490 {
 491         int max = si->max;
 492         int i = prev;
 493         int count;
 494
 495         /*
 496          * No need for swap_device_lock(si) here: we're just looking
 497          * for whether an entry is in use, not modifying it; false
 498          * hits are okay, and sys_swapoff() has already prevented new
 499          * allocations from this area (while holding swap_list_lock()).
 500          */
 501         for (;;) {
 502                 if (++i >= max) {
 503                         if (!prev) {
 504                                 i = 0;
 505                                 break;
 506                         }
 507                         /*
 508                          * No entries in use at top of swap_map,
 509                          * loop back to start and recheck there.
 510                          */
 511                         max = prev + 1;
 512                         prev = 0;
 513                         i = 1;
 514                 }
 515                 count = si->swap_map[i];
 516                 if (count && count != SWAP_MAP_BAD)
 517                         break;
 518         }
 519         return i;
 520 }
 521
 522 /*
 523  * We completely avoid races by reading each swap page in advance,
 524  * and then search for the process using it.  All the necessary
 525  * page table adjustments can then be made atomically.
      *
      * type is the index into swap_info[] of the swap area to empty.
      *
      * Returns 0 once find_next_to_unuse() reports that no entry is left in
      * use, or -ENOMEM when read_swap_cache_async() returns no page for an
      * entry whose map count is still non-zero.
 526  */
 527 static int try_to_unuse(unsigned int type)
 528 {
 529         struct swap_info_struct * si = &swap_info[type];
 530         struct mm_struct *start_mm;
 531         unsigned short *swap_map;
 532         unsigned short swcount;
 533         struct page *page;
 534         swp_entry_t entry;
 535         int i = 0;
 536         int retval = 0;
 537         int reset_overflow = 0;        /* set once a SWAP_MAP_MAX count has been forced back to 1 */
 538         int shmem;                     /* result of the last shmem_unuse() call for the current entry */
 539
 540         /*
 541          * When searching mms for an entry, a good strategy is to
 542          * start at the first mm we freed the previous entry from
 543          * (though actually we don't notice whether we or coincidence
 544          * freed the entry).  Initialize this start_mm with a hold.
 545          *
 546          * A simpler strategy would be to start at the last mm we
 547          * freed the previous entry from; but that would take less
 548          * advantage of mmlist ordering (now preserved by swap_out()),
 549          * which clusters forked address spaces together, most recent
 550          * child immediately after parent.  If we race with dup_mmap(),
 551          * we very much want to resolve parent before child, otherwise
 552          * we may miss some entries: using last mm would invert that.
 553          */
 554         start_mm = &init_mm;
 555         atomic_inc(&init_mm.mm_users);
 556
 557         /*
 558          * Keep on scanning until all entries have gone.  Usually,
 559          * one pass through swap_map is enough, but not necessarily:
 560          * mmput() removes mm from mmlist before exit_mmap() and its
 561          * zap_page_range().  That's not too bad, those entries are
 562          * on their way out, and handled faster there than here.
 563          * do_munmap() behaves similarly, taking the range out of mm's
 564          * vma list before zap_page_range().  But unfortunately, when
 565          * unmapping a part of a vma, it takes the whole out first,
 566          * then reinserts what's left after (might even reschedule if
 567          * open() method called) - so swap entries may be invisible
 568          * to swapoff for a while, then reappear - but that is rare.
 569          */
 570         while ((i = find_next_to_unuse(si, i))) {
 571                 /*
 572                  * Get a page for the entry, using the existing swap
 573                  * cache page if there is one.  Otherwise, get a clean
 574                  * page and read the swap into it.
 575                  */
 576                 swap_map = &si->swap_map[i];
 577                 entry = swp_entry(type, i);
 578                 page = read_swap_cache_async(entry);
 579                 if (!page) {
 580                         /*
 581                          * Either swap_duplicate() failed because entry
 582                          * has been freed independently, and will not be
 583                          * reused since sys_swapoff() already disabled
 584                          * allocation from here, or alloc_page() failed.
 585                          */
 586                         if (!*swap_map)
 587                                 continue;
 588                         retval = -ENOMEM;
 589                         break;
 590                 }
 591
 592                 /*
 593                  * Don't hold on to start_mm if it looks like exiting.
 594                  */
 595                 if (atomic_read(&start_mm->mm_users) == 1) {
 596                         mmput(start_mm);
 597                         start_mm = &init_mm;
 598                         atomic_inc(&init_mm.mm_users);
 599                 }
 600
 601                 /*
 602                  * Wait for and lock page.  When do_swap_page races with
 603                  * try_to_unuse, do_swap_page can handle the fault much
 604                  * faster than try_to_unuse can locate the entry.  This
 605                  * apparently redundant "wait_on_page_locked" lets try_to_unuse
 606                  * defer to do_swap_page in such a case - in some tests,
 607                  * do_swap_page and try_to_unuse repeatedly compete.
 608                  */
 609                 wait_on_page_locked(page);
 610                 wait_on_page_writeback(page);
 611                 lock_page(page);
 612                 wait_on_page_writeback(page);
 613
 614                 /*
 615                  * Remove all references to entry, without blocking.
 616                  * Whenever we reach init_mm, there's no address space
 617                  * to search, but use it as a reminder to search shmem.
 618                  */
 619                 shmem = 0;
 620                 swcount = *swap_map;
 621                 if (swcount > 1) {
 622                         flush_page_to_ram(page);
 623                         if (start_mm == &init_mm)
 624                                 shmem = shmem_unuse(entry, page);
 625                         else
 626                                 unuse_process(start_mm, entry, page);
 627                 }
                     /*
                      * References remain after trying start_mm: walk the rest
                      * of mmlist (calling shmem_unuse() when init_mm is met)
                      * until the count drops to 1 or the walk is back at
                      * start_mm.  While set_start_mm is set, the first mm
                      * whose scan lowered the count is recorded as
                      * new_start_mm and becomes start_mm for the next entry.
                      */
 628                 if (*swap_map > 1) {
 629                         int set_start_mm = (*swap_map >= swcount);
 630                         struct list_head *p = &start_mm->mmlist;
 631                         struct mm_struct *new_start_mm = start_mm;
 632                         struct mm_struct *mm;
 633
 634                         spin_lock(&mmlist_lock);
 635                         while (*swap_map > 1 &&
 636                                         (p = p->next) != &start_mm->mmlist) {
 637                                 mm = list_entry(p, struct mm_struct, mmlist);
 638                                 swcount = *swap_map;
 639                                 if (mm == &init_mm) {
 640                                         set_start_mm = 1;
 641                                         spin_unlock(&mmlist_lock);
 642                                         shmem = shmem_unuse(entry, page);
 643                                         spin_lock(&mmlist_lock);
 644                                 } else
 645                                         unuse_process(mm, entry, page);
 646                                 if (set_start_mm && *swap_map < swcount) {
 647                                         new_start_mm = mm;
 648                                         set_start_mm = 0;
 649                                 }
 650                         }
 651                         atomic_inc(&new_start_mm->mm_users);
 652                         spin_unlock(&mmlist_lock);
 653                         mmput(start_mm);
 654                         start_mm = new_start_mm;
 655                 }
 656
 657                 /*
 658                  * How could swap count reach 0x7fff when the maximum
 659                  * pid is 0x7fff, and there's no way to repeat a swap
 660                  * page within an mm (except in shmem, where it's the
 661                  * shared object which takes the reference count)?
 662                  * We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
 663                  *
 664                  * If that's wrong, then we should worry more about
 665                  * exit_mmap() and do_munmap() cases described above:
 666                  * we might be resetting SWAP_MAP_MAX too early here.
 667                  * We know "Undead"s can happen, they're okay, so don't
 668                  * report them; but do report if we reset SWAP_MAP_MAX.
 669                  */
 670                 if (*swap_map == SWAP_MAP_MAX) {
 671                         swap_list_lock();
 672                         swap_device_lock(si);
 673                         nr_swap_pages++;
 674                         *swap_map = 1;
 675                         swap_device_unlock(si);
 676                         swap_list_unlock();
 677                         reset_overflow = 1;
 678                 }
 679
 680                 /*
 681                  * If a reference remains (rare), we would like to leave
 682                  * the page in the swap cache; but try_to_swap_out could
 683                  * then re-duplicate the entry once we drop page lock,
 684                  * so we might loop indefinitely; also, that page could
 685                  * not be swapped out to other storage meanwhile.  So:
 686                  * delete from cache even if there's another reference,
 687                  * after ensuring that the data has been saved to disk -
 688                  * since if the reference remains (rarer), it will be
 689                  * read from disk into another page.  Splitting into two
 690                  * pages would be incorrect if swap supported "shared
 691                  * private" pages, but they are handled by tmpfs files.
 692                  *
 693                  * Note shmem_unuse already deleted a swappage from
 694                  * the swap cache, unless the move to filepage failed:
 695                  * in which case it left swappage in cache, lowered its
 696                  * swap count to pass quickly through the loops above,
 697                  * and now we must reincrement count to try again later.
 698                  */
 699                 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
 700                         swap_writepage(page);
 701                         lock_page(page);
 702                         wait_on_page_writeback(page);
 703                 }
 704                 if (PageSwapCache(page)) {
 705                         if (shmem)
 706                                 swap_duplicate(entry);
 707                         else
 708                                 delete_from_swap_cache(page);
 709                 }
 710
 711                 /*
 712                  * So we could skip searching mms once swap count went
 713                  * to 1, we did not mark any present ptes as dirty: must
 714                  * mark page dirty so try_to_swap_out will preserve it.
 715                  */
 716                 SetPageDirty(page);
 717                 unlock_page(page);
 718                 page_cache_release(page);
 719
 720                 /*
 721                  * Make sure that we aren't completely killing
 722                  * interactive performance.  Interruptible check on
 723                  * signal_pending() would be nice, but changes the spec?
 724                  */
 725                 if (need_resched())
 726                         schedule();
 727         }
 728
 729         mmput(start_mm);
 730         if (reset_overflow) {
 731                 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
 732                 swap_overflow = 0;
 733         }
 734         return retval;
 735 }
 736
 737 /*
 738  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
 739  * corresponds to page offset `offset'.
 740  */
 741 sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 742 {
 743         struct swap_extent *se = sis->curr_swap_extent;
 744         struct swap_extent *start_se = se;
 745
 746         for ( ; ; ) {
 747                 struct list_head *lh;
 748
 749                 if (se->start_page <= offset &&
 750                                 offset < (se->start_page + se->nr_pages)) {
 751                         return se->start_block + (offset - se->start_page);
 752                 }
 753                 lh = se->list.prev;
 754                 if (lh == &sis->extent_list)
 755                         lh = lh->prev;
 756                 se = list_entry(lh, struct swap_extent, list);
 757                 sis->curr_swap_extent = se;
 758                 BUG_ON(se == start_se);         /* It *must* be present */
 759         }
 760 }
 761
 762 /*
 763  * Free all of a swapdev's extent information
 764  */
 765 static void destroy_swap_extents(struct swap_info_struct *sis)
 766 {
 767         while (!list_empty(&sis->extent_list)) {
 768                 struct swap_extent *se;
 769
 770                 se = list_entry(sis->extent_list.next,
 771                                 struct swap_extent, list);
 772                 list_del(&se->list);
 773                 kfree(se);
 774         }
 775         sis->nr_extents = 0;
 776 }
 777
 778 /*
 779  * Add a block range (and the corresponding page range) into this swapdev's
 780  * extent list.  The extent list is kept sorted in block order.
 781  *
 782  * This function rather assumes that it is called in ascending sector_t order.
 783  * It doesn't look for extent coalescing opportunities.
 784  */
 785 static int
 786 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 787                 unsigned long nr_pages, sector_t start_block)
 788 {
 789         struct swap_extent *se;
 790         struct swap_extent *new_se;
 791         struct list_head *lh;
 792
 793         lh = sis->extent_list.next;     /* The highest-addressed block */
 794         while (lh != &sis->extent_list) {
 795                 se = list_entry(lh, struct swap_extent, list);
 796                 if (se->start_block + se->nr_pages == start_block) {
 797                         /* Merge it */
 798                         se->nr_pages += nr_pages;
 799                         return 0;
 800                 }
 801                 lh = lh->next;
 802         }
 803
 804         /*
 805          * No merge.  Insert a new extent, preserving ordering.
 806          */
 807         new_se = kmalloc(sizeof(*se), GFP_KERNEL);
 808         if (new_se == NULL)
 809                 return -ENOMEM;
 810         new_se->start_page = start_page;
 811         new_se->nr_pages = nr_pages;
 812         new_se->start_block = start_block;
 813
 814         lh = sis->extent_list.prev;     /* The lowest block */
 815         while (lh != &sis->extent_list) {
 816                 se = list_entry(lh, struct swap_extent, list);
 817                 if (se->start_block > start_block)
 818                         break;
 819                 lh = lh->prev;
 820         }
 821         list_add_tail(&new_se->list, lh);
 822         sis->nr_extents++;
 823         return 0;
 824 }
 825
 826 /*
 827  * A `swap extent' is a simple thing which maps a contiguous range of pages
 828  * onto a contiguous range of disk blocks.  An ordered list of swap extents
 829  * is built at swapon time and is then used at swap_writepage/swap_readpage
 830  * time for locating where on disk a page belongs.
 831  *
 832  * If the swapfile is an S_ISBLK block device, a single extent is installed.
 833  * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 834  * swap files identically.
 835  *
 836  * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 837  * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 838  * swapfiles are handled *identically* after swapon time.
 839  *
 840  * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 841  * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 842  * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 843  * requirements, they are simply tossed out - we will never use those blocks
 844  * for swapping.
 845  *
 846  * The amount of disk space which a single swap extent represents varies.
 847  * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 848  * extents in the list.  To avoid much list walking, we cache the previous
 849  * search location in `curr_swap_extent', and start new searches from there.
 850  * This is extremely effective.  The average number of iterations in
 851  * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 852  */
 853 static int setup_swap_extents(struct swap_info_struct *sis)
 854 {
 855         struct inode *inode;
 856         unsigned blocks_per_page;
 857         unsigned long page_no;
 858         unsigned blkbits;
 859         sector_t probe_block;
 860         sector_t last_block;
 861         int ret;
 862
 863         inode = sis->swap_file->f_dentry->d_inode;
 864         if (S_ISBLK(inode->i_mode)) {
 865                 ret = add_swap_extent(sis, 0, sis->max, 0);
 866                 goto done;
 867         }
 868
 869         blkbits = inode->i_blkbits;
 870         blocks_per_page = PAGE_SIZE >> blkbits;
 871
 872         /*
 873          * Map all the blocks into the extent list.  This code doesn't try
 874          * to be very smart.
 875          */
 876         probe_block = 0;
 877         page_no = 0;
 878         last_block = inode->i_size >> blkbits;
 879         while ((probe_block + blocks_per_page) <= last_block &&
 880                         page_no < sis->max) {
 881                 unsigned block_in_page;
 882                 sector_t first_block;
 883
 884                 first_block = bmap(inode, probe_block);
 885                 if (first_block == 0)
 886                         goto bad_bmap;
 887
 888                 /*
 889                  * It must be PAGE_SIZE aligned on-disk
 890                  */
 891                 if (first_block & (blocks_per_page - 1)) {
 892                         probe_block++;
 893                         goto reprobe;
 894                 }
 895
 896                 for (block_in_page = 1; block_in_page < blocks_per_page;
 897                                         block_in_page++) {
 898                         sector_t block;
 899
 900                         block = bmap(inode, probe_block + block_in_page);
 901                         if (block == 0)
 902                                 goto bad_bmap;
 903                         if (block != first_block + block_in_page) {
 904                                 /* Discontiguity */
 905                                 probe_block++;
 906                                 goto reprobe;
 907                         }
 908                 }
 909
 910                 /*
 911                  * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
 912                  */
 913                 ret = add_swap_extent(sis, page_no, 1,
 914                                 first_block >> (PAGE_SHIFT - blkbits));
 915                 if (ret)
 916                         goto out;
 917                 page_no++;
 918                 probe_block += blocks_per_page;
 919 reprobe:
 920                 continue;
 921         }
 922         ret = 0;
 923         if (page_no == 0)
 924                 ret = -EINVAL;
 925         sis->max = page_no;
 926         sis->highest_bit = page_no - 1;
 927 done:
 928         sis->curr_swap_extent = list_entry(sis->extent_list.prev,
 929                                         struct swap_extent, list);
 930         goto out;
 931 bad_bmap:
 932         printk(KERN_ERR "swapon: swapfile has holes\n");
 933         ret = -EINVAL;
 934 out:
 935         return ret;
 936 }
 937
 938 #if 0   /* We don't need this yet */
 939 #include <linux/backing-dev.h>
 940 int page_queue_congested(struct page *page)
 941 {
 942         struct backing_dev_info *bdi;
 943
 944         BUG_ON(!PageLocked(page));      /* It pins the swap_info_struct */
 945
 946         bdi = page->mapping->backing_dev_info;
 947         if (PageSwapCache(page)) {
 948                 swp_entry_t entry = { .val = page->index };
 949                 struct swap_info_struct *sis;
 950
 951                 sis = get_swap_info_struct(swp_type(entry));
 952                 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
 953         }
 954         return bdi_write_congested(bdi);
 955 }
 956 #endif
 957
 958 asmlinkage long sys_swapoff(const char * specialfile)
 959 {
 960         struct swap_info_struct * p = NULL;
 961         unsigned short *swap_map;
 962         struct file *swap_file, *victim;
 963         struct address_space *mapping;
 964         int i, type, prev;
 965         int err;
 966
 967         if (!capable(CAP_SYS_ADMIN))
 968                 return -EPERM;
 969
 970         victim = filp_open(specialfile, O_RDWR, 0);
 971         err = PTR_ERR(victim);
 972         if (IS_ERR(victim))
 973                 goto out;
 974
 975         mapping = victim->f_dentry->d_inode->i_mapping;
 976         prev = -1;
 977         swap_list_lock();
 978         for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
 979                 p = swap_info + type;
 980                 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
 981                         if (p->swap_file->f_dentry->d_inode->i_mapping==mapping)
 982                                 break;
 983                 }
 984                 prev = type;
 985         }
 986         err = -EINVAL;
 987         if (type < 0) {
 988                 swap_list_unlock();
 989                 goto out_dput;
 990         }
 991
 992         if (prev < 0) {
 993                 swap_list.head = p->next;
 994         } else {
 995                 swap_info[prev].next = p->next;
 996         }
 997         if (type == swap_list.next) {
 998                 /* just pick something that's safe... */
 999                 swap_list.next = swap_list.head;
1000         }
1001         nr_swap_pages -= p->pages;
1002         total_swap_pages -= p->pages;
1003         p->flags &= ~SWP_WRITEOK;
1004         swap_list_unlock();
1005         err = try_to_unuse(type);
1006         if (err) {
1007                 /* re-insert swap space back into swap_list */
1008                 swap_list_lock();
1009                 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
1010                         if (p->prio >= swap_info[i].prio)
1011                                 break;
1012                 p->next = i;
1013                 if (prev < 0)
1014                         swap_list.head = swap_list.next = p - swap_info;
1015                 else
1016                         swap_info[prev].next = p - swap_info;
1017                 nr_swap_pages += p->pages;
1018                 total_swap_pages += p->pages;
1019                 p->flags |= SWP_WRITEOK;
1020                 swap_list_unlock();
1021                 goto out_dput;
1022         }
1023         swap_list_lock();
1024         swap_device_lock(p);
1025         swap_file = p->swap_file;
1026         p->swap_file = NULL;
1027         p->max = 0;
1028         swap_map = p->swap_map;
1029         p->swap_map = NULL;
1030         p->flags = 0;
1031         destroy_swap_extents(p);
1032         swap_device_unlock(p);
1033         swap_list_unlock();
1034         vfree(swap_map);
1035         if (S_ISBLK(swap_file->f_dentry->d_inode->i_mode)) {
1036                 struct block_device *bdev;
1037                 bdev = swap_file->f_dentry->d_inode->i_bdev;
1038                 set_blocksize(bdev, p->old_block_size);
1039                 bd_release(bdev);
1040         }
1041         filp_close(swap_file, NULL);
1042         err = 0;
1043
1044 out_dput:
1045         filp_close(victim, NULL);
1046 out:
1047         return err;
1048 }
1049
1050 #ifdef CONFIG_PROC_FS
1051 /* iterator */
1052 static void *swap_start(struct seq_file *swap, loff_t *pos)
1053 {
1054         struct swap_info_struct *ptr = swap_info;
1055         int i;
1056         loff_t l = *pos;
1057         char * page = (char *) __get_free_page(GFP_KERNEL);
1058
1059         swap->private = page;   /* save for swap_show */
1060         swap_list_lock();
1061
1062         if (!page)
1063                 return ERR_PTR(-ENOMEM);
1064
1065         for (i = 0; i < nr_swapfiles; i++, ptr++) {
1066                 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1067                         continue;
1068                 if (!l--)
1069                         return ptr;
1070         }
1071
1072         return NULL;
1073 }
1074
1075 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1076 {
1077         struct swap_info_struct *ptr = v;
1078         void *endptr = (void *) swap_info + nr_swapfiles * sizeof(struct swap_info_struct);
1079
1080         for (++ptr; ptr < (struct swap_info_struct *) endptr; ptr++) {
1081                 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
1082                         continue;
1083                 ++*pos;
1084                 return ptr;
1085         }
1086
1087         return NULL;
1088 }
1089
1090 static void swap_stop(struct seq_file *swap, void *v)
1091 {
1092         swap_list_unlock();
1093         free_page((unsigned long) swap->private);
1094         swap->private = NULL;
1095 }
1096
1097 static int swap_show(struct seq_file *swap, void *v)
1098 {
1099         struct swap_info_struct *ptr = v;
1100         int j, usedswap;
1101         struct file *file;
1102         char *path;
1103
1104         if (v == swap_info)
1105                 seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1106
1107         file = ptr->swap_file;
1108         path = d_path(file->f_dentry, file->f_vfsmnt, swap->private, PAGE_SIZE);
1109
1110         for (j = 0, usedswap = 0; j < ptr->max; ++j)
1111                 switch (ptr->swap_map[j]) {
1112                         case SWAP_MAP_BAD:
1113                         case 0:
1114                                 continue;
1115                         default:
1116                                 usedswap++;
1117                 }
1118         seq_printf(swap, "%-39s %s\t%d\t%d\t%d\n",
1119                        path,
1120                        S_ISBLK(file->f_dentry->d_inode->i_mode) ?
1121                                 "partition" : "file\t",
1122                        ptr->pages << (PAGE_SHIFT - 10),
1123                        usedswap << (PAGE_SHIFT - 10),
1124                        ptr->prio);
1125         return 0;
1126 }
1127
1128 static struct seq_operations swaps_op = {
1129         .start =        swap_start,
1130         .next =         swap_next,
1131         .stop =         swap_stop,
1132         .show =         swap_show
1133 };
1134
1135 static int swaps_open(struct inode *inode, struct file *file)
1136 {
1137         return seq_open(file, &swaps_op);
1138 }
1139
1140 static struct file_operations proc_swaps_operations = {
1141         .open           = swaps_open,
1142         .read           = seq_read,
1143         .llseek         = seq_lseek,
1144         .release        = seq_release,
1145 };
1146
1147 static int __init procswaps_init(void)
1148 {
1149         struct proc_dir_entry *entry;
1150
1151         entry = create_proc_entry("swaps", 0, NULL);
1152         if (entry)
1153                 entry->proc_fops = &proc_swaps_operations;
1154         return 0;
1155 }
1156 __initcall(procswaps_init);
1157 #endif /* CONFIG_PROC_FS */
1158
1159 /*
1160  * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1161  *
1162  * The swapon system call
1163  */
1164 asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
1165 {
1166         struct swap_info_struct * p;
1167         char *name = NULL;
1168         struct block_device *bdev = NULL;
1169         struct file *swap_file = NULL;
1170         struct address_space *mapping;
1171         unsigned int type;
1172         int i, prev;
1173         int error;
1174         static int least_priority = 0;
1175         union swap_header *swap_header = 0;
1176         int swap_header_version;
1177         int nr_good_pages = 0;
1178         unsigned long maxpages = 1;
1179         int swapfilesize;
1180         unsigned short *swap_map;
1181         struct page *page = NULL;
1182
1183         if (!capable(CAP_SYS_ADMIN))
1184                 return -EPERM;
1185         swap_list_lock();
1186         p = swap_info;
1187         for (type = 0 ; type < nr_swapfiles ; type++,p++)
1188                 if (!(p->flags & SWP_USED))
1189                         break;
1190         error = -EPERM;
1191         if (type >= MAX_SWAPFILES) {
1192                 swap_list_unlock();
1193                 goto out;
1194         }
1195         if (type >= nr_swapfiles)
1196                 nr_swapfiles = type+1;
1197         INIT_LIST_HEAD(&p->extent_list);
1198         p->flags = SWP_USED;
1199         p->nr_extents = 0;
1200         p->swap_file = NULL;
1201         p->old_block_size = 0;
1202         p->swap_map = NULL;
1203         p->lowest_bit = 0;
1204         p->highest_bit = 0;
1205         p->cluster_nr = 0;
1206         p->sdev_lock = SPIN_LOCK_UNLOCKED;
1207         p->next = -1;
1208         if (swap_flags & SWAP_FLAG_PREFER) {
1209                 p->prio =
1210                   (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1211         } else {
1212                 p->prio = --least_priority;
1213         }
1214         swap_list_unlock();
1215         name = getname(specialfile);
1216         error = PTR_ERR(name);
1217         if (IS_ERR(name))
1218                 goto bad_swap_2;
1219         swap_file = filp_open(name, O_RDWR, 0);
1220         error = PTR_ERR(swap_file);
1221         if (IS_ERR(swap_file)) {
1222                 swap_file = NULL;
1223                 goto bad_swap_2;
1224         }
1225
1226         p->swap_file = swap_file;
1227
1228         error = -EINVAL;
1229         if (S_ISBLK(swap_file->f_dentry->d_inode->i_mode)) {
1230                 bdev = swap_file->f_dentry->d_inode->i_bdev;
1231                 error = bd_claim(bdev, sys_swapon);
1232                 if (error < 0) {
1233                         bdev = NULL;
1234                         goto bad_swap;
1235                 }
1236                 p->old_block_size = block_size(bdev);
1237                 error = set_blocksize(swap_file->f_dentry->d_inode->i_bdev,
1238                                       PAGE_SIZE);
1239                 if (error < 0)
1240                         goto bad_swap;
1241                 p->bdev = bdev;
1242         } else if (S_ISREG(swap_file->f_dentry->d_inode->i_mode)) {
1243                 p->bdev = swap_file->f_dentry->d_inode->i_sb->s_bdev;
1244         } else {
1245                 goto bad_swap;
1246         }
1247
1248         mapping = swap_file->f_dentry->d_inode->i_mapping;
1249         swapfilesize = mapping->host->i_size >> PAGE_SHIFT;
1250
1251         error = -EBUSY;
1252         for (i = 0 ; i < nr_swapfiles ; i++) {
1253                 struct swap_info_struct *q = &swap_info[i];
1254                 if (i == type || !q->swap_file)
1255                         continue;
1256                 if (mapping == q->swap_file->f_dentry->d_inode->i_mapping)
1257                         goto bad_swap;
1258         }
1259
1260         /*
1261          * Read the swap header.
1262          */
1263         page = read_cache_page(mapping, 0,
1264                         (filler_t *)mapping->a_ops->readpage, swap_file);
1265         if (IS_ERR(page)) {
1266                 error = PTR_ERR(page);
1267                 goto bad_swap;
1268         }
1269         wait_on_page_locked(page);
1270         if (!PageUptodate(page))
1271                 goto bad_swap;
1272         kmap(page);
1273         swap_header = page_address(page);
1274
1275         if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
1276                 swap_header_version = 1;
1277         else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1278                 swap_header_version = 2;
1279         else {
1280                 printk("Unable to find swap-space signature\n");
1281                 error = -EINVAL;
1282                 goto bad_swap;
1283         }
1284
1285         switch (swap_header_version) {
1286         case 1:
1287                 printk(KERN_ERR "version 0 swap is no longer supported. "
1288                         "Use mkswap -v1 %s\n", name);
1289                 error = -EINVAL;
1290                 goto bad_swap;
1291         case 2:
1292                 /* Check the swap header's sub-version and the size of
1293                    the swap file and bad block lists */
1294                 if (swap_header->info.version != 1) {
1295                         printk(KERN_WARNING
1296                                "Unable to handle swap header version %d\n",
1297                                swap_header->info.version);
1298                         error = -EINVAL;
1299                         goto bad_swap;
1300                 }
1301
1302                 p->lowest_bit  = 1;
1303                 maxpages = swp_offset(swp_entry(0,~0UL)) - 1;
1304                 if (maxpages > swap_header->info.last_page)
1305                         maxpages = swap_header->info.last_page;
1306                 p->highest_bit = maxpages - 1;
1307
1308                 error = -EINVAL;
1309                 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1310                         goto bad_swap;
1311
1312                 /* OK, set up the swap map and apply the bad block list */
1313                 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1314                         error = -ENOMEM;
1315                         goto bad_swap;
1316                 }
1317
1318                 error = 0;
1319                 memset(p->swap_map, 0, maxpages * sizeof(short));
1320                 for (i=0; i<swap_header->info.nr_badpages; i++) {
1321                         int page = swap_header->info.badpages[i];
1322                         if (page <= 0 || page >= swap_header->info.last_page)
1323                                 error = -EINVAL;
1324                         else
1325                                 p->swap_map[page] = SWAP_MAP_BAD;
1326                 }
1327                 nr_good_pages = swap_header->info.last_page -
1328                                 swap_header->info.nr_badpages -
1329                                 1 /* header page */;
1330                 if (error)
1331                         goto bad_swap;
1332         }
1333
1334         if (swapfilesize && maxpages > swapfilesize) {
1335                 printk(KERN_WARNING
1336                        "Swap area shorter than signature indicates\n");
1337                 error = -EINVAL;
1338                 goto bad_swap;
1339         }
1340         if (!nr_good_pages) {
1341                 printk(KERN_WARNING "Empty swap-file\n");
1342                 error = -EINVAL;
1343                 goto bad_swap;
1344         }
1345         p->swap_map[0] = SWAP_MAP_BAD;
1346         p->max = maxpages;
1347         p->pages = nr_good_pages;
1348
1349         if (setup_swap_extents(p))
1350                 goto bad_swap;
1351
1352         swap_list_lock();
1353         swap_device_lock(p);
1354         p->flags = SWP_ACTIVE;
1355         nr_swap_pages += nr_good_pages;
1356         total_swap_pages += nr_good_pages;
1357         printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
1358                 nr_good_pages<<(PAGE_SHIFT-10), name,
1359                 p->prio, p->nr_extents);
1360
1361         /* insert swap space into swap_list: */
1362         prev = -1;
1363         for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1364                 if (p->prio >= swap_info[i].prio) {
1365                         break;
1366                 }
1367                 prev = i;
1368         }
1369         p->next = i;
1370         if (prev < 0) {
1371                 swap_list.head = swap_list.next = p - swap_info;
1372         } else {
1373                 swap_info[prev].next = p - swap_info;
1374         }
1375         swap_device_unlock(p);
1376         swap_list_unlock();
1377         error = 0;
1378         goto out;
1379 bad_swap:
1380         if (bdev) {
1381                 set_blocksize(bdev, p->old_block_size);
1382                 bd_release(bdev);
1383         }
1384 bad_swap_2:
1385         swap_list_lock();
1386         swap_map = p->swap_map;
1387         p->swap_file = NULL;
1388         p->swap_map = NULL;
1389         p->flags = 0;
1390         if (!(swap_flags & SWAP_FLAG_PREFER))
1391                 ++least_priority;
1392         swap_list_unlock();
1393         destroy_swap_extents(p);
1394         if (swap_map)
1395                 vfree(swap_map);
1396         if (swap_file && !IS_ERR(swap_file))
1397                 filp_close(swap_file, NULL);
1398 out:
1399         if (page && !IS_ERR(page)) {
1400                 kunmap(page);
1401                 page_cache_release(page);
1402         }
1403         if (name)
1404                 putname(name);
1405         return error;
1406 }
1407
1408 void si_swapinfo(struct sysinfo *val)
1409 {
1410         unsigned int i;
1411         unsigned long nr_to_be_unused = 0;
1412
1413         swap_list_lock();
1414         for (i = 0; i < nr_swapfiles; i++) {
1415                 unsigned int j;
1416                 if (!(swap_info[i].flags & SWP_USED) ||
1417                      (swap_info[i].flags & SWP_WRITEOK))
1418                         continue;
1419                 for (j = 0; j < swap_info[i].max; ++j) {
1420                         switch (swap_info[i].swap_map[j]) {
1421                                 case 0:
1422                                 case SWAP_MAP_BAD:
1423                                         continue;
1424                                 default:
1425                                         nr_to_be_unused++;
1426                         }
1427                 }
1428         }
1429         val->freeswap = nr_swap_pages + nr_to_be_unused;
1430         val->totalswap = total_swap_pages + nr_to_be_unused;
1431         swap_list_unlock();
1432 }
1433
1434 /*
1435  * Verify that a swap entry is valid and increment its swap map count.
1436  *
1437  * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
1438  * "permanent", but will be reclaimed by the next swapoff.
1439  */
1440 int swap_duplicate(swp_entry_t entry)
1441 {
1442         struct swap_info_struct * p;
1443         unsigned long offset, type;
1444         int result = 0;
1445
1446         type = swp_type(entry);
1447         if (type >= nr_swapfiles)
1448                 goto bad_file;
1449         p = type + swap_info;
1450         offset = swp_offset(entry);
1451
1452         swap_device_lock(p);
1453         if (offset < p->max && p->swap_map[offset]) {
1454                 if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
1455                         p->swap_map[offset]++;
1456                         result = 1;
1457                 } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
1458                         if (swap_overflow++ < 5)
1459                                 printk(KERN_WARNING "swap_dup: swap entry overflow\n");
1460                         p->swap_map[offset] = SWAP_MAP_MAX;
1461                         result = 1;
1462                 }
1463         }
1464         swap_device_unlock(p);
1465 out:
1466         return result;
1467
1468 bad_file:
1469         printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1470         goto out;
1471 }
1472
1473 struct swap_info_struct *
1474 get_swap_info_struct(unsigned type)
1475 {
1476         return &swap_info[type];
1477 }
1478
1479 /*
1480  * swap_device_lock prevents swap_map being freed. Don't grab an extra
1481  * reference on the swaphandle, it doesn't matter if it becomes unused.
1482  */
1483 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1484 {
1485         int ret = 0, i = 1 << page_cluster;
1486         unsigned long toff;
1487         struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
1488
1489         if (!page_cluster)      /* no readahead */
1490                 return 0;
1491         toff = (swp_offset(entry) >> page_cluster) << page_cluster;
1492         if (!toff)              /* first page is swap header */
1493                 toff++, i--;
1494         *offset = toff;
1495
1496         swap_device_lock(swapdev);
1497         do {
1498                 /* Don't read-ahead past the end of the swap area */
1499                 if (toff >= swapdev->max)
1500                         break;
1501                 /* Don't read in free or bad pages */
1502                 if (!swapdev->swap_map[toff])
1503                         break;
1504                 if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
1505                         break;
1506                 toff++;
1507                 ret++;
1508         } while (--i);
1509         swap_device_unlock(swapdev);
1510         return ret;
1511 }