/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *		 2000-2001 Christoph Rohland
 * Copyright (C) 2002-2003 Hugh Dickins.
 * Copyright (C) 2002-2003 VERITAS Software Corporation.
 *
 * This file is released under the GPL.
 */

/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */
#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/vfs.h>
#include <linux/blkdev.h>
#include <linux/security.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
/* This magic number is used in glibc for posix shared memory */
#define TMPFS_MAGIC	0x01021994

#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)

#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
#define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
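
/*
 * A worked example of the limit above: with 4K PAGE_CACHE_SIZE,
 * 4-byte unsigned long and SHMEM_NR_DIRECT == 16, ENTRIES_PER_PAGE
 * is 1024, so SHMEM_MAX_INDEX is 16 + (1024*1024/2) * 1025, about
 * 537 million pages, making SHMEM_MAX_BYTES roughly 2TB per file.
 */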
#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/* Keep swapped page count in private field of indirect struct page */
#define nr_swapped	private
/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
enum sgp_type {
	SGP_QUICK,	/* don't try more than file page cache lookup */
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_WRITE,	/* may exceed i_size, may allocate page */
};
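
/*
 * As used at the call sites below: shmem_nopage and shmem_populate
 * pass SGP_CACHE (SGP_QUICK when nonblocking), shmem_file_read passes
 * SGP_READ, and the write paths (shmem_file_write, shmem_prepare_write,
 * shmem_symlink) pass SGP_WRITE.
 */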
static int shmem_getpage(struct inode *inode, unsigned long idx,
			 struct page **pagep, enum sgp_type sgp);
static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
{
	/*
	 * The above definition of ENTRIES_PER_PAGE, and the use of
	 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
	 * might be reconsidered if it ever diverges from PAGE_SIZE.
	 */
	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
}
static inline void shmem_dir_free(struct page *page)
{
	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
}
static struct page **shmem_dir_map(struct page *page)
{
	return (struct page **)kmap_atomic(page, KM_USER0);
}
static inline void shmem_dir_unmap(struct page **dir)
{
	kunmap_atomic(dir, KM_USER0);
}
static swp_entry_t *shmem_swp_map(struct page *page)
{
	/*
	 * We have to avoid the unconditional inc_preempt_count()
	 * in kmap_atomic(), since shmem_swp_unmap() will also be
	 * applied to the low memory addresses within i_direct[].
	 * PageHighMem and high_memory tests are good for all arches
	 * and configs: highmem_start_page and FIXADDR_START are not.
	 */
	return PageHighMem(page)?
		(swp_entry_t *)kmap_atomic(page, KM_USER1):
		(swp_entry_t *)page_address(page);
}
static inline void shmem_swp_unmap(swp_entry_t *entry)
{
	if (entry >= (swp_entry_t *)high_memory)
		kunmap_atomic(entry, KM_USER1);
}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}
static struct super_operations shmem_ops;
static struct address_space_operations shmem_aops;
static struct file_operations shmem_file_operations;
static struct inode_operations shmem_inode_operations;
static struct inode_operations shmem_dir_inode_operations;
static struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info = {
	.ra_pages	= 0,	/* No readahead */
	.memory_backed	= 1,	/* Does not contribute to dirty memory */
};
LIST_HEAD(shmem_inodes);
static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
static void shmem_free_block(struct inode *inode)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	spin_lock(&sbinfo->stat_lock);
	sbinfo->free_blocks++;
	inode->i_blocks -= BLOCKS_PER_PAGE;
	spin_unlock(&sbinfo->stat_lock);
}
/*
 * shmem_recalc_inode - recalculate the size of an inode
 *
 * @inode: inode to recalc
 *
 * We have to calculate the free blocks since the mm can drop
 * undirtied hole pages behind our back.
 *
 * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
static void shmem_recalc_inode(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	long freed;

	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
	if (freed > 0) {
		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
		info->alloced -= freed;
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_blocks += freed;
		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
		spin_unlock(&sbinfo->stat_lock);
	}
}
/*
 * shmem_swp_entry - find the swap vector position in the info structure
 *
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
 * If there is no space allocated yet it will return NULL when
 * page is NULL, else it will use the page for the needed block,
 * setting it to NULL on return to indicate that it has been used.
 *
 * The swap vector is organized the following way:
 *
 * There are SHMEM_NR_DIRECT entries directly stored in the
 * shmem_inode_info structure. So small files do not need an additional
 * allocation.
 *
 * For pages with index > SHMEM_NR_DIRECT there is the pointer
 * i_indirect which points to a page which holds in the first half
 * doubly indirect blocks, in the second half triple indirect blocks:
 *
 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 * following layout (for SHMEM_NR_DIRECT == 16):
 *
 * i_indirect -> dir --> 16-19
 * 	      |      +-> 20-23
 * 	      |
 * 	      +-->dir2 --> 24-27
 * 	      |	       +-> 28-31
 * 	      |	       +-> 32-35
 * 	      |	       +-> 36-39
 * 	      |
 * 	      +-->dir3 --> 40-43
 * 	               +-> 44-47
 * 	               +-> 48-51
 * 	               +-> 52-55
 */
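
/*
 * A worked example with the same artificial parameters: to find page
 * 26, shmem_swp_entry computes index = 26-16 = 10, offset = 10%4 = 2,
 * index = 10/4 = 2; since 2 >= ENTRIES_PER_PAGE/2 it descends the
 * triple indirect half to dir2, whose first block covers pages 24-27,
 * and returns the entry at offset 2 within it, i.e. page 26.
 */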
static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
{
	unsigned long offset;
	struct page **dir;
	struct page *subdir;

	if (index < SHMEM_NR_DIRECT)
		return info->i_direct+index;
	if (!info->i_indirect) {
		if (page) {
			info->i_indirect = *page;
			*page = NULL;
		}
		return NULL;			/* need another page */
	}

	index -= SHMEM_NR_DIRECT;
	offset = index % ENTRIES_PER_PAGE;
	index /= ENTRIES_PER_PAGE;
	dir = shmem_dir_map(info->i_indirect);

	if (index >= ENTRIES_PER_PAGE/2) {
		index -= ENTRIES_PER_PAGE/2;
		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
		index %= ENTRIES_PER_PAGE;
		subdir = *dir;
		if (!subdir) {
			if (page) {
				*dir = *page;
				*page = NULL;
			}
			shmem_dir_unmap(dir);
			return NULL;		/* need another page */
		}
		shmem_dir_unmap(dir);
		dir = shmem_dir_map(subdir);
	}

	dir += index;
	subdir = *dir;
	if (!subdir) {
		if (!page || !(subdir = *page)) {
			shmem_dir_unmap(dir);
			return NULL;		/* need a page */
		}
		*dir = subdir;
		*page = NULL;
	}
	shmem_dir_unmap(dir);

	/*
	 * With apologies... caller shmem_swp_alloc passes non-NULL
	 * page (though perhaps NULL *page); and now we know that this
	 * indirect page has been allocated, we can shortcut the final
	 * kmap if we know it contains no swap entries, as is commonly
	 * the case: return pointer to a 0 which doesn't need kmapping.
	 */
	return (page && !subdir->nr_swapped)?
		(swp_entry_t *)&subdir->nr_swapped:
		shmem_swp_map(subdir) + offset;
}
static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
{
	long incdec = value? 1: -1;

	entry->val = value;
	info->swapped += incdec;
	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
		kmap_atomic_to_page(entry)->nr_swapped += incdec;
}
/*
 * shmem_swp_alloc - get the position of the swap entry for the page.
 *                   If it does not exist allocate the entry.
 *
 * @info:  info structure for the inode
 * @index: index of the page to find
 * @sgp:   check and recheck i_size? skip allocation?
 */
static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
{
	struct inode *inode = &info->vfs_inode;
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct page *page = NULL;
	swp_entry_t *entry;
	static const swp_entry_t unswapped = {0};

	if (sgp != SGP_WRITE &&
	    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
		return ERR_PTR(-EINVAL);

	while (!(entry = shmem_swp_entry(info, index, &page))) {
		if (sgp == SGP_READ)
			return (swp_entry_t *) &unswapped;
		/*
		 * Test free_blocks against 1 not 0, since we have 1 data
		 * page (and perhaps indirect index pages) yet to allocate:
		 * a waste to allocate index if we cannot allocate data.
		 */
		spin_lock(&sbinfo->stat_lock);
		if (sbinfo->free_blocks <= 1) {
			spin_unlock(&sbinfo->stat_lock);
			return ERR_PTR(-ENOSPC);
		}
		sbinfo->free_blocks--;
		inode->i_blocks += BLOCKS_PER_PAGE;
		spin_unlock(&sbinfo->stat_lock);

		spin_unlock(&info->lock);
		page = shmem_dir_alloc(inode->i_mapping->gfp_mask);
		if (page) {
			clear_highpage(page);
			page->nr_swapped = 0;
		}
		spin_lock(&info->lock);

		if (!page) {
			shmem_free_block(inode);
			return ERR_PTR(-ENOMEM);
		}
		if (sgp != SGP_WRITE &&
		    ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
			entry = ERR_PTR(-EINVAL);
			break;
		}
		if (info->next_index <= index)
			info->next_index = index + 1;
	}
	if (page) {
		/* another task gave its page, or truncated the file */
		shmem_free_block(inode);
		shmem_dir_free(page);
	}
	if (info->next_index <= index && !IS_ERR(entry))
		info->next_index = index + 1;
	return entry;
}
/*
 * shmem_free_swp - free some swap entries in a directory
 *
 * @dir:  pointer to the directory
 * @edir: pointer after last entry of the directory
 */
static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
{
	swp_entry_t *ptr;
	int freed = 0;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val) {
			free_swap_and_cache(*ptr);
			*ptr = (swp_entry_t){0};
			freed++;
		}
	}
	return freed;
}
static void shmem_truncate(struct inode *inode)
{
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	struct page *empty;
	swp_entry_t *ptr;
	int offset;
	int freed;

	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (idx >= info->next_index)
		return;

	spin_lock(&info->lock);
	limit = info->next_index;
	info->next_index = idx;
	/* free any swap held in the direct entries */
	if (info->swapped && idx < SHMEM_NR_DIRECT) {
		ptr = info->i_direct;
		size = limit;
		if (size > SHMEM_NR_DIRECT)
			size = SHMEM_NR_DIRECT;
		info->swapped -= shmem_free_swp(ptr+idx, ptr+size);
	}
	if (!info->i_indirect)
		goto done2;

	BUG_ON(limit <= SHMEM_NR_DIRECT);
	limit -= SHMEM_NR_DIRECT;
	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
	offset = idx % ENTRIES_PER_PAGE;
	idx -= offset;

	empty = NULL;
	dir = shmem_dir_map(info->i_indirect);
	stage = ENTRIES_PER_PAGEPAGE/2;
	if (idx < ENTRIES_PER_PAGEPAGE/2)
		dir += idx/ENTRIES_PER_PAGE;
	else {
		dir += ENTRIES_PER_PAGE/2;
		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
		while (stage <= idx)
			stage += ENTRIES_PER_PAGEPAGE;
		subdir = *dir;
		if (subdir) {
			size = ((idx - ENTRIES_PER_PAGEPAGE/2) %
				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
			if (!size && !offset) {
				empty = subdir;
				*dir = NULL;
			}
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir) + size;
		} else {
			offset = 0;
			idx = stage;
		}
	}

	for (; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			/* crossing into the next middle directory */
			shmem_dir_unmap(dir-1);
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto done1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			*dir = NULL;
			shmem_dir_unmap(dir);
			if (empty) {
				shmem_dir_free(empty);
				shmem_free_block(inode);
			}
			empty = subdir;
			cond_resched_lock(&info->lock);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		if (subdir && subdir->nr_swapped) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			freed = shmem_free_swp(ptr+offset, ptr+size);
			shmem_swp_unmap(ptr);
			info->swapped -= freed;
			subdir->nr_swapped -= freed;
			BUG_ON(subdir->nr_swapped > offset);
		}
		if (offset)
			offset = 0;
		else if (subdir) {
			*dir = NULL;
			shmem_dir_free(subdir);
			shmem_free_block(inode);
		}
	}
done1:
	shmem_dir_unmap(dir-1);
	if (empty) {
		shmem_dir_free(empty);
		shmem_free_block(inode);
	}
	if (info->next_index <= SHMEM_NR_DIRECT) {
		shmem_dir_free(info->i_indirect);
		info->i_indirect = NULL;
		shmem_free_block(inode);
	}
done2:
	BUG_ON(info->swapped > info->next_index);
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);
}
static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct page *page = NULL;
	int error;
	long change = 0;

	if ((attr->ia_valid & ATTR_SIZE) && (attr->ia_size <= SHMEM_MAX_BYTES)) {
		/*
		 * Account swap file usage based on new file size,
		 * but just let vmtruncate fail on out-of-range sizes.
		 */
		change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size);
		if (change > 0) {
			if (security_vm_enough_memory(change))
				return -ENOMEM;
		} else if (attr->ia_size < inode->i_size) {
			vm_unacct_memory(-change);
			/*
			 * If truncating down to a partial page, then
			 * if that page is already allocated, hold it
			 * in memory until the truncation is over, so
			 * truncate_partial_page cannot miss it were
			 * it assigned to swap.
			 */
			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
				(void) shmem_getpage(inode,
					attr->ia_size>>PAGE_CACHE_SHIFT,
						&page, SGP_READ);
			}
		}
	}

	error = inode_change_ok(inode, attr);
	if (!error)
		error = inode_setattr(inode, attr);
	if (page)
		page_cache_release(page);
	if (error)
		vm_unacct_memory(change);
	return error;
}
static void shmem_delete_inode(struct inode *inode)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);

	if (inode->i_op->truncate == shmem_truncate) {
		spin_lock(&shmem_ilock);
		list_del(&info->list);
		spin_unlock(&shmem_ilock);
		if (info->flags & VM_ACCOUNT)
			vm_unacct_memory(VM_ACCT(inode->i_size));
		inode->i_size = 0;
		shmem_truncate(inode);
	}
	BUG_ON(inode->i_blocks);
	spin_lock(&sbinfo->stat_lock);
	sbinfo->free_inodes++;
	spin_unlock(&sbinfo->stat_lock);
	clear_inode(inode);
}
static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
{
	swp_entry_t *ptr;

	for (ptr = dir; ptr < edir; ptr++) {
		if (ptr->val == entry.val)
			return ptr - dir;
	}
	return -1;
}
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
	struct inode *inode;
	unsigned long idx;
	unsigned long size;
	unsigned long limit;
	unsigned long stage;
	struct page **dir;
	struct page *subdir;
	swp_entry_t *ptr;
	int offset;

	idx = 0;
	ptr = info->i_direct;
	spin_lock(&info->lock);
	limit = info->next_index;
	size = limit;
	if (size > SHMEM_NR_DIRECT)
		size = SHMEM_NR_DIRECT;
	offset = shmem_find_swp(entry, ptr, ptr+size);
	if (offset >= 0)
		goto found;
	if (!info->i_indirect)
		goto lost2;
	/* we might be racing with shmem_truncate */
	if (limit <= SHMEM_NR_DIRECT)
		goto lost2;

	dir = shmem_dir_map(info->i_indirect);
	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;

	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
		if (unlikely(idx == stage)) {
			shmem_dir_unmap(dir-1);
			dir = shmem_dir_map(info->i_indirect) +
			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
			while (!*dir) {
				dir++;
				idx += ENTRIES_PER_PAGEPAGE;
				if (idx >= limit)
					goto lost1;
			}
			stage = idx + ENTRIES_PER_PAGEPAGE;
			subdir = *dir;
			shmem_dir_unmap(dir);
			dir = shmem_dir_map(subdir);
		}
		subdir = *dir;
		if (subdir && subdir->nr_swapped) {
			ptr = shmem_swp_map(subdir);
			size = limit - idx;
			if (size > ENTRIES_PER_PAGE)
				size = ENTRIES_PER_PAGE;
			offset = shmem_find_swp(entry, ptr, ptr+size);
			if (offset >= 0) {
				shmem_dir_unmap(dir);
				goto found;
			}
			shmem_swp_unmap(ptr);
		}
	}
lost1:
	shmem_dir_unmap(dir-1);
lost2:
	spin_unlock(&info->lock);
	return 0;
found:
	idx += offset;
	inode = &info->vfs_inode;

	/* Racing against delete or truncate? Must leave out of page cache */
	limit = (inode->i_state & I_FREEING)? 0:
		(i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (idx >= limit ||
	    move_from_swap_cache(page, idx, inode->i_mapping) == 0)
		shmem_swp_set(info, ptr + offset, 0);
	shmem_swp_unmap(ptr);
	spin_unlock(&info->lock);
	/*
	 * Decrement swap count even when the entry is left behind:
	 * try_to_unuse will skip over mms, then reincrement count.
	 */
	swap_free(entry);
	return 1;
}
/*
 * shmem_unuse() searches for a possibly swapped-out shmem page.
 */
int shmem_unuse(swp_entry_t entry, struct page *page)
{
	struct list_head *p;
	struct shmem_inode_info *info;
	int found = 0;

	spin_lock(&shmem_ilock);
	list_for_each(p, &shmem_inodes) {
		info = list_entry(p, struct shmem_inode_info, list);

		if (info->swapped && shmem_unuse_inode(info, entry, page)) {
			/* move head to start search for next from here */
			list_move_tail(&shmem_inodes, &info->list);
			found = 1;
			break;
		}
	}
	spin_unlock(&shmem_ilock);
	return found;
}
/*
 * Move the page from the page cache to the swap cache.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
	struct shmem_inode_info *info;
	swp_entry_t *entry, swap;
	struct address_space *mapping;
	unsigned long index;
	struct inode *inode;

	BUG_ON(!PageLocked(page));
	BUG_ON(page_mapped(page));

	mapping = page->mapping;
	index = page->index;
	inode = mapping->host;
	info = SHMEM_I(inode);
	if (info->flags & VM_LOCKED)
		goto redirty;
	swap = get_swap_page();
	if (!swap.val)
		goto redirty;

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	BUG_ON(index >= info->next_index);
	entry = shmem_swp_entry(info, index, NULL);
	BUG_ON(!entry);
	BUG_ON(entry->val);

	if (move_to_swap_cache(page, swap) == 0) {
		shmem_swp_set(info, entry, swap.val);
		shmem_swp_unmap(entry);
		spin_unlock(&info->lock);
		unlock_page(page);
		return 0;
	}

	shmem_swp_unmap(entry);
	spin_unlock(&info->lock);
	swap_free(swap);
redirty:
	set_page_dirty(page);
	return WRITEPAGE_ACTIVATE;	/* Return with the page locked */
}
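
/*
 * WRITEPAGE_ACTIVATE tells the VM that the page could not be written
 * out and should go back onto the active list rather than be rotated
 * on the inactive list: shmem uses it both for VM_LOCKED files and
 * when no swap slot could be obtained.
 */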
/*
 * shmem_getpage - either get the page from swap or allocate a new one
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry, since a page cannot live in both the swap and page cache.
 */
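
/*
 * In outline, an added summary of the logic below: look the page up in
 * the file's page cache first; if the index has a swap entry set,
 * bring the page back via the swap cache; otherwise, for SGP_READ map
 * in the zero page, and for the allocating modes charge a block and
 * add a freshly zeroed page to the page cache.
 */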
static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp)
{
	struct address_space *mapping = inode->i_mapping;
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo;
	struct page *filepage = *pagep;
	struct page *swappage;
	swp_entry_t *entry;
	swp_entry_t swap;
	int error = 0;

	if (idx >= SHMEM_MAX_INDEX)
		return -EFBIG;

	/*
	 * Normally, filepage is NULL on entry, and either found
	 * uptodate immediately, or allocated and zeroed, or read
	 * in under swappage, which is then assigned to filepage.
	 * But shmem_prepare_write passes in a locked filepage,
	 * which may be found not uptodate by other callers too,
	 * and may need to be copied from the swappage read in.
	 */
repeat:
	if (!filepage)
		filepage = find_lock_page(mapping, idx);
	if (filepage && PageUptodate(filepage))
		goto done;
	error = 0;
	if (sgp == SGP_QUICK)
		goto failed;

	spin_lock(&info->lock);
	shmem_recalc_inode(inode);
	entry = shmem_swp_alloc(info, idx, sgp);
	if (IS_ERR(entry)) {
		spin_unlock(&info->lock);
		error = PTR_ERR(entry);
		goto failed;
	}
	swap = *entry;

	if (swap.val) {
		/* Look it up and read it in.. */
		swappage = lookup_swap_cache(swap);
		if (!swappage) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			swapin_readahead(swap);
			swappage = read_swap_cache_async(swap);
			if (!swappage) {
				spin_lock(&info->lock);
				entry = shmem_swp_alloc(info, idx, sgp);
				if (IS_ERR(entry))
					error = PTR_ERR(entry);
				else {
					if (entry->val == swap.val)
						error = -ENOMEM;
					shmem_swp_unmap(entry);
				}
				spin_unlock(&info->lock);
				if (error)
					goto failed;
				goto repeat;
			}
			wait_on_page_locked(swappage);
			page_cache_release(swappage);
			goto repeat;
		}

		/* We have to do this with page locked to prevent races */
		if (TestSetPageLocked(swappage)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_locked(swappage);
			page_cache_release(swappage);
			goto repeat;
		}
		if (PageWriteback(swappage)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			wait_on_page_writeback(swappage);
			unlock_page(swappage);
			page_cache_release(swappage);
			goto repeat;
		}
		if (!PageUptodate(swappage)) {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			unlock_page(swappage);
			page_cache_release(swappage);
			error = -EIO;
			goto failed;
		}

		if (filepage) {
			shmem_swp_set(info, entry, 0);
			shmem_swp_unmap(entry);
			delete_from_swap_cache(swappage);
			spin_unlock(&info->lock);
			copy_highpage(filepage, swappage);
			unlock_page(swappage);
			page_cache_release(swappage);
			flush_dcache_page(filepage);
			SetPageUptodate(filepage);
			set_page_dirty(filepage);
			swap_free(swap);
		} else if (!(error = move_from_swap_cache(
				swappage, idx, mapping))) {
			shmem_swp_set(info, entry, 0);
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			filepage = swappage;
			swap_free(swap);
		} else {
			shmem_swp_unmap(entry);
			spin_unlock(&info->lock);
			unlock_page(swappage);
			page_cache_release(swappage);
			if (error == -ENOMEM) {
				/* let kswapd refresh zone for GFP_ATOMICs */
				blk_congestion_wait(WRITE, HZ/50);
			}
			goto repeat;
		}
	} else if (sgp == SGP_READ && !filepage) {
		shmem_swp_unmap(entry);
		filepage = find_get_page(mapping, idx);
		if (filepage &&
		    (!PageUptodate(filepage) || TestSetPageLocked(filepage))) {
			spin_unlock(&info->lock);
			wait_on_page_locked(filepage);
			page_cache_release(filepage);
			filepage = NULL;
			goto repeat;
		}
		spin_unlock(&info->lock);
	} else {
		shmem_swp_unmap(entry);
		sbinfo = SHMEM_SB(inode->i_sb);
		spin_lock(&sbinfo->stat_lock);
		if (sbinfo->free_blocks == 0) {
			spin_unlock(&sbinfo->stat_lock);
			spin_unlock(&info->lock);
			error = -ENOSPC;
			goto failed;
		}
		sbinfo->free_blocks--;
		inode->i_blocks += BLOCKS_PER_PAGE;
		spin_unlock(&sbinfo->stat_lock);

		if (!filepage) {
			spin_unlock(&info->lock);
			filepage = page_cache_alloc(mapping);
			if (!filepage) {
				shmem_free_block(inode);
				error = -ENOMEM;
				goto failed;
			}

			spin_lock(&info->lock);
			entry = shmem_swp_alloc(info, idx, sgp);
			if (IS_ERR(entry))
				error = PTR_ERR(entry);
			else {
				swap = *entry;
				shmem_swp_unmap(entry);
			}
			if (error || swap.val || 0 != add_to_page_cache_lru(
					filepage, mapping, idx, GFP_ATOMIC)) {
				spin_unlock(&info->lock);
				page_cache_release(filepage);
				shmem_free_block(inode);
				filepage = NULL;
				if (error)
					goto failed;
				goto repeat;
			}
		}

		info->alloced++;
		spin_unlock(&info->lock);
		clear_highpage(filepage);
		flush_dcache_page(filepage);
		SetPageUptodate(filepage);
	}
done:
	if (!*pagep) {
		if (filepage) {
			unlock_page(filepage);
			*pagep = filepage;
		} else
			*pagep = ZERO_PAGE(0);
	}
	return 0;

failed:
	if (*pagep != filepage) {
		unlock_page(filepage);
		page_cache_release(filepage);
	}
	return error;
}
struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	struct page *page = NULL;
	unsigned long idx;
	int error;

	idx = (address - vma->vm_start) >> PAGE_SHIFT;
	idx += vma->vm_pgoff;
	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;

	error = shmem_getpage(inode, idx, &page, SGP_CACHE);
	if (error)
		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;

	mark_page_accessed(page);
	return page;
}
static int shmem_populate(struct vm_area_struct *vma,
	unsigned long addr, unsigned long len,
	pgprot_t prot, unsigned long pgoff, int nonblock)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	struct mm_struct *mm = vma->vm_mm;
	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
	unsigned long size;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
		return -EINVAL;

	while ((long) len > 0) {
		struct page *page = NULL;
		int err;
		/*
		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
		 */
		err = shmem_getpage(inode, pgoff, &page, sgp);
		if (err)
			return err;
		if (page) {
			mark_page_accessed(page);
			err = install_page(mm, vma, addr, page, prot);
			if (err) {
				page_cache_release(page);
				return err;
			}
		}
		len -= PAGE_SIZE;
		addr += PAGE_SIZE;
		pgoff++;
	}
	return 0;
}
void shmem_lock(struct file *file, int lock)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct shmem_inode_info *info = SHMEM_I(inode);

	spin_lock(&info->lock);
	if (lock)
		info->flags |= VM_LOCKED;
	else
		info->flags &= ~VM_LOCKED;
	spin_unlock(&info->lock);
}
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct vm_operations_struct *ops;
	struct inode *inode = file->f_dentry->d_inode;

	ops = &shmem_vm_ops;
	if (!S_ISREG(inode->i_mode))
		return -EACCES;
	update_atime(inode);
	vma->vm_ops = ops;
	return 0;
}
static struct inode *
shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	spin_lock(&sbinfo->stat_lock);
	if (!sbinfo->free_inodes) {
		spin_unlock(&sbinfo->stat_lock);
		return NULL;
	}
	sbinfo->free_inodes--;
	spin_unlock(&sbinfo->stat_lock);

	inode = new_inode(sb);
	if (inode) {
		inode->i_mode = mode;
		inode->i_uid = current->fsuid;
		inode->i_gid = current->fsgid;
		inode->i_blksize = PAGE_CACHE_SIZE;
		inode->i_blocks = 0;
		inode->i_rdev = NODEV;
		inode->i_mapping->a_ops = &shmem_aops;
		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		info->flags = VM_ACCOUNT;
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			spin_lock(&shmem_ilock);
			list_add_tail(&info->list, &shmem_inodes);
			spin_unlock(&shmem_ilock);
			break;
		case S_IFDIR:
			inode->i_nlink++;
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * BOGO_DIRENT_SIZE;
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			break;
		case S_IFLNK:
			break;
		}
	}
	return inode;
}
static int shmem_set_size(struct shmem_sb_info *info,
			unsigned long max_blocks, unsigned long max_inodes)
{
	int error;
	unsigned long blocks, inodes;

	spin_lock(&info->stat_lock);
	blocks = info->max_blocks - info->free_blocks;
	inodes = info->max_inodes - info->free_inodes;
	error = -EINVAL;
	if (max_blocks < blocks)
		goto out;
	if (max_inodes < inodes)
		goto out;
	error = 0;
	info->max_blocks  = max_blocks;
	info->free_blocks = max_blocks - blocks;
	info->max_inodes  = max_inodes;
	info->free_inodes = max_inodes - inodes;
out:
	spin_unlock(&info->stat_lock);
	return error;
}
static struct inode_operations shmem_symlink_inode_operations;
static struct inode_operations shmem_symlink_inline_operations;
/*
 * Normally tmpfs makes no use of shmem_prepare_write, but it
 * lets a tmpfs file be used read-write below the loop driver.
 */
static int
shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
	struct inode *inode = page->mapping->host;
	return shmem_getpage(inode, page->index, &page, SGP_WRITE);
}
static ssize_t
shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	struct inode *inode = file->f_dentry->d_inode;
	loff_t pos;
	unsigned long written;
	ssize_t err;
	loff_t maxpos;

	if ((ssize_t) count < 0)
		return -EINVAL;

	if (!access_ok(VERIFY_READ, buf, count))
		return -EFAULT;

	down(&inode->i_sem);

	pos = *ppos;
	written = 0;

	err = generic_write_checks(inode, file, &pos, &count, 0);
	if (err || !count)
		goto out;

	maxpos = inode->i_size;
	if (maxpos < pos + count) {
		maxpos = pos + count;
		if (security_vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) {
			err = -ENOMEM;
			goto out;
		}
	}

	remove_suid(file->f_dentry);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	do {
		struct page *page = NULL;
		unsigned long bytes, index, offset;
		char *kaddr;
		int left;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		/*
		 * We don't hold page lock across copy from user -
		 * what would it guard against? - so no deadlock here.
		 * But it still may be a good idea to prefault below.
		 */

		err = shmem_getpage(inode, index, &page, SGP_WRITE);
		if (err)
			break;

		left = bytes;
		if (PageHighMem(page)) {
			volatile unsigned char dummy;
			__get_user(dummy, buf);
			__get_user(dummy, buf + bytes - 1);

			kaddr = kmap_atomic(page, KM_USER0);
			left = __copy_from_user(kaddr + offset, buf, bytes);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (left) {
			kaddr = kmap(page);
			left = __copy_from_user(kaddr + offset, buf, bytes);
			kunmap(page);
		}

		written += bytes;
		count -= bytes;
		pos += bytes;
		buf += bytes;
		if (pos > inode->i_size)
			inode->i_size = pos;

		flush_dcache_page(page);
		set_page_dirty(page);
		if (!PageReferenced(page))
			SetPageReferenced(page);
		page_cache_release(page);

		if (left) {
			pos -= left;
			written -= left;
			err = -EFAULT;
			break;
		}

		/*
		 * Our dirty pages are not counted in nr_dirty,
		 * and we do not attempt to balance dirty pages.
		 */

		cond_resched();
	} while (count);

	*ppos = pos;
	if (written)
		err = written;

	/* Short writes give back address space */
	if (inode->i_size != maxpos)
		vm_unacct_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size));
out:
	up(&inode->i_sem);
	return err;
}
static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	for (;;) {
		struct page *page = NULL;
		unsigned long end_index, nr, ret;
		loff_t i_size = i_size_read(inode);

		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		desc->error = shmem_getpage(inode, index, &page, SGP_READ);
		if (desc->error) {
			if (desc->error == -EINVAL)
				desc->error = 0;
			break;
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_sem protection against truncate
		 */
		nr = PAGE_CACHE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_CACHE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset) {
				page_cache_release(page);
				break;
			}
		}
		nr -= offset;

		if (page != ZERO_PAGE(0)) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (!list_empty(&mapping->i_mmap_shared))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (ret != nr || !desc->count)
			break;

		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	update_atime(inode);
}
static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	read_descriptor_t desc;

	if ((ssize_t) count < 0)
		return -EINVAL;
	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;
	if (!count)
		return 0;

	desc.written = 0;
	desc.count = count;
	desc.buf = buf;
	desc.error = 0;

	do_shmem_file_read(filp, ppos, &desc, file_read_actor);
	if (desc.written)
		return desc.written;
	return desc.error;
}
static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
			 size_t count, read_actor_t actor, void __user *target)
{
	read_descriptor_t desc;

	if (!count)
		return 0;

	desc.written = 0;
	desc.count = count;
	desc.buf = target;
	desc.error = 0;

	do_shmem_file_read(in_file, ppos, &desc, actor);
	if (desc.written)
		return desc.written;
	return desc.error;
}
static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	buf->f_type = TMPFS_MAGIC;
	buf->f_bsize = PAGE_CACHE_SIZE;
	spin_lock(&sbinfo->stat_lock);
	buf->f_blocks = sbinfo->max_blocks;
	buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
	buf->f_files = sbinfo->max_inodes;
	buf->f_ffree = sbinfo->free_inodes;
	spin_unlock(&sbinfo->stat_lock);
	buf->f_namelen = NAME_MAX;
	return 0;
}
/*
 * File creation. Allocate an inode, and we're done..
 */
static int
shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
	int error = -ENOSPC;

	if (inode) {
		dir->i_size += BOGO_DIRENT_SIZE;
		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		d_instantiate(dentry, inode);
		dget(dentry); /* Extra count - pin the dentry in core */
		error = 0;
	}
	return error;
}
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	int error;

	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
		return error;
	dir->i_nlink++;
	return 0;
}
static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;

	dir->i_size += BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	inode->i_nlink++;
	atomic_inc(&inode->i_count);	/* New dentry reference */
	dget(dentry);		/* Extra pinning count for the created dentry */
	d_instantiate(dentry, inode);
	return 0;
}
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	dir->i_size -= BOGO_DIRENT_SIZE;
	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	inode->i_nlink--;
	dput(dentry);	/* Undo the count from "create" - this does all the work */
	return 0;
}
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	dir->i_nlink--;
	return shmem_unlink(dir, dentry);
}
/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly frees it when it
 * gets overwritten.
 */
static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;
	int they_are_dirs = S_ISDIR(inode->i_mode);

	if (!simple_empty(new_dentry))
		return -ENOTEMPTY;

	if (new_dentry->d_inode) {
		(void) shmem_unlink(new_dir, new_dentry);
		if (they_are_dirs)
			old_dir->i_nlink--;
	} else if (they_are_dirs) {
		old_dir->i_nlink--;
		new_dir->i_nlink++;
	}

	old_dir->i_size -= BOGO_DIRENT_SIZE;
	new_dir->i_size += BOGO_DIRENT_SIZE;
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	inode->i_ctime = CURRENT_TIME;
	return 0;
}
static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
{
	int error;
	int len;
	struct inode *inode;
	struct page *page = NULL;
	char *kaddr;
	struct shmem_inode_info *info;

	len = strlen(symname) + 1;
	if (len > PAGE_CACHE_SIZE)
		return -ENAMETOOLONG;

	inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
	if (!inode)
		return -ENOSPC;

	info = SHMEM_I(inode);
	inode->i_size = len-1;
	if (len <= (char *)inode - (char *)info) {
		/* do it inline */
		memcpy(info, symname, len);
		inode->i_op = &shmem_symlink_inline_operations;
	} else {
		if (security_vm_enough_memory(VM_ACCT(1))) {
			iput(inode);
			return -ENOMEM;
		}
		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
		if (error) {
			vm_unacct_memory(VM_ACCT(1));
			iput(inode);
			return error;
		}
		inode->i_op = &shmem_symlink_inode_operations;
		spin_lock(&shmem_ilock);
		list_add_tail(&info->list, &shmem_inodes);
		spin_unlock(&shmem_ilock);
		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr, symname, len);
		kunmap_atomic(kaddr, KM_USER0);
		set_page_dirty(page);
		page_cache_release(page);
	}
	dir->i_size += BOGO_DIRENT_SIZE;
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	d_instantiate(dentry, inode);
	dget(dentry);
	return 0;
}
static int shmem_readlink_inline(struct dentry *dentry, char __user *buffer, int buflen)
{
	return vfs_readlink(dentry, buffer, buflen, (const char *)SHMEM_I(dentry->d_inode));
}

static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
{
	return vfs_follow_link(nd, (const char *)SHMEM_I(dentry->d_inode));
}
static int shmem_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
	struct page *page = NULL;
	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
	if (res)
		return res;
	res = vfs_readlink(dentry, buffer, buflen, kmap(page));
	kunmap(page);
	mark_page_accessed(page);
	page_cache_release(page);
	return res;
}
static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct page *page = NULL;
	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
	if (res)
		return res;
	res = vfs_follow_link(nd, kmap(page));
	kunmap(page);
	mark_page_accessed(page);
	page_cache_release(page);
	return res;
}
static struct inode_operations shmem_symlink_inline_operations = {
	.readlink	= shmem_readlink_inline,
	.follow_link	= shmem_follow_link_inline,
};

static struct inode_operations shmem_symlink_inode_operations = {
	.truncate	= shmem_truncate,
	.readlink	= shmem_readlink,
	.follow_link	= shmem_follow_link,
};
static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
{
	char *this_char, *value, *rest;

	while ((this_char = strsep(&options, ",")) != NULL) {
		if (!*this_char)
			continue;
		if ((value = strchr(this_char,'=')) != NULL) {
			*value++ = 0;
		} else {
			printk(KERN_ERR
			    "tmpfs: No value for mount option '%s'\n",
			    this_char);
			return 1;
		}

		if (!strcmp(this_char,"size")) {
			unsigned long long size;
			size = memparse(value,&rest);
			if (*rest == '%') {
				size <<= PAGE_SHIFT;
				size *= totalram_pages;
				do_div(size, 100);
				rest++;
			}
			if (*rest)
				goto bad_val;
			*blocks = size >> PAGE_CACHE_SHIFT;
		} else if (!strcmp(this_char,"nr_blocks")) {
			*blocks = memparse(value,&rest);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"nr_inodes")) {
			*inodes = memparse(value,&rest);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"mode")) {
			if (!mode)
				continue;
			*mode = simple_strtoul(value,&rest,8);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"uid")) {
			if (!uid)
				continue;
			*uid = simple_strtoul(value,&rest,0);
			if (*rest)
				goto bad_val;
		} else if (!strcmp(this_char,"gid")) {
			if (!gid)
				continue;
			*gid = simple_strtoul(value,&rest,0);
			if (*rest)
				goto bad_val;
		} else {
			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
			       this_char);
			return 1;
		}
	}
	return 0;

bad_val:
	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
	       value, this_char);
	return 1;
}
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
	unsigned long max_blocks = sbinfo->max_blocks;
	unsigned long max_inodes = sbinfo->max_inodes;

	if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
		return -EINVAL;
	return shmem_set_size(sbinfo, max_blocks, max_inodes);
}
static int shmem_fill_super(struct super_block *sb,
			    void *data, int silent)
{
	struct inode *inode;
	struct dentry *root;
	unsigned long blocks, inodes;
	int mode   = S_IRWXUGO | S_ISVTX;
	uid_t uid = current->fsuid;
	gid_t gid = current->fsgid;
	struct shmem_sb_info *sbinfo;
	int error = -ENOMEM;

	sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;
	sb->s_fs_info = sbinfo;
	memset(sbinfo, 0, sizeof(struct shmem_sb_info));

	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance
	 */
	blocks = inodes = totalram_pages / 2;

#ifdef CONFIG_TMPFS
	if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes)) {
		error = -EINVAL;
		goto failed;
	}
#else
	sb->s_flags |= MS_NOUSER;
#endif

	spin_lock_init(&sbinfo->stat_lock);
	sbinfo->max_blocks = blocks;
	sbinfo->free_blocks = blocks;
	sbinfo->max_inodes = inodes;
	sbinfo->free_inodes = inodes;
	sb->s_maxbytes = SHMEM_MAX_BYTES;
	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	sb->s_op = &shmem_ops;
	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
	if (!inode)
		goto failed;
	inode->i_uid = uid;
	inode->i_gid = gid;
	root = d_alloc_root(inode);
	if (!root)
		goto failed_iput;
	sb->s_root = root;
	return 0;

failed_iput:
	iput(inode);
failed:
	sb->s_fs_info = NULL;
	kfree(sbinfo);
	return error;
}
static void shmem_put_super(struct super_block *sb)
{
	kfree(sb->s_fs_info);
	sb->s_fs_info = NULL;
}
static kmem_cache_t *shmem_inode_cachep;

static struct inode *shmem_alloc_inode(struct super_block *sb)
{
	struct shmem_inode_info *p;
	p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, SLAB_KERNEL);
	if (!p)
		return NULL;
	return &p->vfs_inode;
}
static void shmem_destroy_inode(struct inode *inode)
{
	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
{
	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR) {
		inode_init_once(&p->vfs_inode);
	}
}
static int init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
				init_once, NULL);
	if (shmem_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}
static void destroy_inodecache(void)
{
	if (kmem_cache_destroy(shmem_inode_cachep))
		printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n");
}
static struct address_space_operations shmem_aops = {
	.writepage	= shmem_writepage,
	.set_page_dirty	= __set_page_dirty_nobuffers,
#ifdef CONFIG_TMPFS
	.prepare_write	= shmem_prepare_write,
	.commit_write	= simple_commit_write,
#endif
};
static struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
#ifdef CONFIG_TMPFS
	.llseek		= generic_file_llseek,
	.read		= shmem_file_read,
	.write		= shmem_file_write,
	.fsync		= simple_sync_file,
	.sendfile	= shmem_file_sendfile,
#endif
};
static struct inode_operations shmem_inode_operations = {
	.truncate	= shmem_truncate,
	.setattr	= shmem_notify_change,
};
static struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
	.create		= shmem_create,
	.lookup		= simple_lookup,
	.link		= shmem_link,
	.unlink		= shmem_unlink,
	.symlink	= shmem_symlink,
	.mkdir		= shmem_mkdir,
	.rmdir		= shmem_rmdir,
	.mknod		= shmem_mknod,
	.rename		= shmem_rename,
#endif
};
static struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,
	.destroy_inode	= shmem_destroy_inode,
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.remount_fs	= shmem_remount_fs,
#endif
	.delete_inode	= shmem_delete_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
};
static struct vm_operations_struct shmem_vm_ops = {
	.nopage		= shmem_nopage,
	.populate	= shmem_populate,
};
static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
}
static struct file_system_type tmpfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "tmpfs",
	.get_sb		= shmem_get_sb,
	.kill_sb	= kill_litter_super,
};
static struct vfsmount *shm_mnt;
static int __init init_tmpfs(void)
{
	int error;

	error = init_inodecache();
	if (error)
		goto out3;

	error = register_filesystem(&tmpfs_fs_type);
	if (error) {
		printk(KERN_ERR "Could not register tmpfs\n");
		goto out2;
	}
#ifdef CONFIG_TMPFS
	devfs_mk_dir("shm");
#endif
	shm_mnt = kern_mount(&tmpfs_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		printk(KERN_ERR "Could not kern_mount tmpfs\n");
		goto out1;
	}

	/* The internal instance should not do size checking */
	shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
	return 0;

out1:
	unregister_filesystem(&tmpfs_fs_type);
out2:
	destroy_inodecache();
out3:
	shm_mnt = ERR_PTR(error);
	return error;
}
module_init(init_tmpfs)
/*
 * shmem_file_setup - get an unlinked file living in tmpfs
 *
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: vm_flags
 */
struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
{
	int error;
	struct file *file;
	struct inode *inode;
	struct dentry *dentry, *root;
	struct qstr this;

	if (IS_ERR(shm_mnt))
		return (void *)shm_mnt;

	if (size > SHMEM_MAX_BYTES)
		return ERR_PTR(-EINVAL);

	if ((flags & VM_ACCOUNT) && security_vm_enough_memory(VM_ACCT(size)))
		return ERR_PTR(-ENOMEM);

	error = -ENOMEM;
	this.name = name;
	this.len = strlen(name);
	this.hash = 0; /* will go */
	root = shm_mnt->mnt_root;
	dentry = d_alloc(root, &this);
	if (!dentry)
		goto put_memory;

	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto put_dentry;

	error = -ENOSPC;
	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
	if (!inode)
		goto close_file;

	SHMEM_I(inode)->flags &= flags;
	d_instantiate(dentry, inode);
	inode->i_size = size;
	inode->i_nlink = 0;	/* It is unlinked */
	file->f_vfsmnt = mntget(shm_mnt);
	file->f_dentry = dentry;
	file->f_op = &shmem_file_operations;
	file->f_mode = FMODE_WRITE | FMODE_READ;
	return file;

close_file:
	put_filp(file);
put_dentry:
	dput(dentry);
put_memory:
	if (flags & VM_ACCOUNT)
		vm_unacct_memory(VM_ACCT(size));
	return ERR_PTR(error);
}
/*
 * shmem_zero_setup - setup a shared anonymous mapping
 *
 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	struct file *file;
	loff_t size = vma->vm_end - vma->vm_start;

	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = file;
	vma->vm_ops = &shmem_vm_ops;
	return 0;
}

EXPORT_SYMBOL(shmem_file_setup);
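
/*
 * An illustrative caller of shmem_file_setup (a hypothetical sketch,
 * not part of this file): a driver wanting a swap-backed scratch
 * object could do something like the code below, where the name
 * "dev/my-scratch" and the helper are made up for the example;
 * shmem_zero_setup above is the real in-tree user.
 */
#if 0	/* example only */
static struct file *my_scratch_file(loff_t size)
{
	struct file *filp;

	/* VM_ACCOUNT charges the size against the overcommit limits */
	filp = shmem_file_setup("dev/my-scratch", size, VM_ACCOUNT);
	if (IS_ERR(filp))
		return filp;	/* -EINVAL, -ENOMEM, -ENFILE or -ENOSPC */
	/* filp is an unlinked tmpfs file: read/write via filp->f_op */
	return filp;
}
#endif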