/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	 Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
 *                         Christoph Rohland <hans-christoph.rohland@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * make it a file system, Christoph Rohland <hans-christoph.rohland@sap.com>
 *
 * The filesystem has the following restrictions/bugs:
 * 1) It can only handle one directory.
 * 2) Private writable mappings are not supported.
 * 3) Read and write are not implemented (should they be?)
 * 4) No special nodes are supported.
 *
 * There are the following mount options:
 * - nr_blocks (^= shmall) is the number of blocks of size PAGE_SIZE
 *   we are allowed to allocate
 * - nr_inodes (^= shmmni) is the number of files we are allowed to
 *   allocate
 * - mode is the mode for the root directory (default S_IRWXUGO | S_ISVTX)
 */
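/*
 * Illustrative sketch (editorial note, not part of the original source):
 * with the options above, mounting the filesystem by hand could look
 * like the following; the mount point /dev/shm is an assumption.
 *
 *	mount -t shm -o nr_blocks=4096,nr_inodes=128,mode=1777 shm /dev/shm
 *
 * nr_blocks and nr_inodes bound shm_ctlall and shm_ctlmni below, and
 * mode 1777 matches the default S_IRWXUGO | S_ISVTX.
 */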
#include <linux/config.h>
#include <linux/module.h>
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/locks.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "util.h"
static struct super_block *shm_read_super(struct super_block *,void *, int);
static void	      shm_put_super  (struct super_block *);
static int	      shm_remount_fs (struct super_block *, int *, char *);
static void	      shm_read_inode (struct inode *);
static void	      shm_write_inode(struct inode *);
static int	      shm_statfs     (struct super_block *, struct statfs *);
static int	      shm_create     (struct inode *,struct dentry *,int);
static struct dentry *shm_lookup    (struct inode *,struct dentry *);
static int	      shm_unlink     (struct inode *,struct dentry *);
static int	      shm_setattr    (struct dentry *dent, struct iattr *attr);
static void	      shm_delete     (struct inode *);
static int	      shm_mmap       (struct file *, struct vm_area_struct *);
static int	      shm_readdir    (struct file *, void *, filldir_t);
#define SHM_NAME_LEN	NAME_MAX
#define SHM_FMT		".IPC_%08x"
#define SHM_FMT_LEN	13
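/*
 * Worked example (editorial note, not in the original): SYSV segments
 * created without a name get one generated from SHM_FMT, so id 0x2a
 * becomes ".IPC_0000002a".  SHM_FMT_LEN is 13 because the fixed
 * ".IPC_" prefix is 5 characters and "%08x" always expands to exactly
 * 8 hex digits: 5 + 8 = 13 (the trailing NUL is not counted; buffers
 * below are declared SHM_FMT_LEN+1).
 */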
/* shm_mode upper byte flags */
/* SHM_DEST and SHM_LOCKED are used in ipcs(8) */
#define PRV_DEST	0010000	/* segment will be destroyed on last detach */
#define PRV_LOCKED	0020000	/* segment will not be swapped */
#define SHM_UNLK	0040000	/* filename is unlinked */
#define SHM_SYSV	0100000	/* It is a SYSV shm segment */
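/*
 * Worked example (editorial note, not in the original): these octal
 * values sit above the nine permission bits in shm_perm.mode, so a
 * named SYSV segment with permissions 0644 carries the value
 * SHM_SYSV | 0644 == 0100644, and one whose name has been unlinked
 * would read SHM_SYSV | SHM_UNLK | PRV_DEST | 0644.
 */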
struct shmid_kernel /* private to the kernel */
{
	struct kern_ipc_perm	shm_perm;
	size_t			shm_segsz;
	unsigned long		shm_nattch;
	unsigned long		shm_npages; /* size of segment (pages) */
	pte_t			**shm_dir;  /* ptr to arr of ptrs to frames */
	int			id;
	union permap {
		struct shmem {
			time_t			atime;
			time_t			dtime;
			time_t			ctime;
			pid_t			cpid;
			pid_t			lpid;
			int			nlen;
			char			nm[0];
		} shmem;
		struct zero {
			struct semaphore	sema;
			struct list_head	list;
		} zero;
	} permap;
};

#define shm_atim	permap.shmem.atime
#define shm_dtim	permap.shmem.dtime
#define shm_ctim	permap.shmem.ctime
#define shm_cprid	permap.shmem.cpid
#define shm_lprid	permap.shmem.lpid
#define shm_namelen	permap.shmem.nlen
#define shm_name	permap.shmem.nm
#define shm_flags	shm_perm.mode
#define zsem		permap.zero.sema
#define zero_list	permap.zero.list
static struct ipc_ids shm_ids;

#define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
#define shm_unlock(id)	ipc_unlock(&shm_ids,id)
#define shm_lockall()	ipc_lockall(&shm_ids)
#define shm_unlockall()	ipc_unlockall(&shm_ids)
#define shm_get(id)	((struct shmid_kernel*)ipc_get(&shm_ids,id))
#define shm_buildid(id, seq) \
	ipc_buildid(&shm_ids, id, seq)
static int newseg (key_t key, const char *name, int namelen, int shmflg, size_t size);
static void seg_free(struct shmid_kernel *shp, int doacc);
static void shm_open (struct vm_area_struct *shmd);
static void shm_close (struct vm_area_struct *shmd);
static int shm_remove_name(int id);
static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
static int shm_swapout(struct page *, struct file *);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
#endif

static void zshm_swap (int prio, int gfp_mask);
static void zmap_unuse(swp_entry_t entry, struct page *page);
static void shmzero_open(struct vm_area_struct *shmd);
static void shmzero_close(struct vm_area_struct *shmd);
static struct page *shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share);
static int zero_id;
static struct shmid_kernel zshmid_kernel;
static struct dentry *zdent;

#define SHM_FS_MAGIC 0x02011994

static struct super_block * shm_sb;

static DECLARE_FSTYPE(shm_fs_type, "shm", shm_read_super, FS_SINGLE);
static struct super_operations shm_sops = {
	read_inode:	shm_read_inode,
	write_inode:	shm_write_inode,
	delete_inode:	shm_delete,
	put_super:	shm_put_super,
	statfs:		shm_statfs,
	remount_fs:	shm_remount_fs,
};

static struct file_operations shm_root_operations = {
	readdir:	shm_readdir,
};

static struct inode_operations shm_root_inode_operations = {
	create:		shm_create,
	lookup:		shm_lookup,
	unlink:		shm_unlink,
};

static struct file_operations shm_file_operations = {
	mmap:		shm_mmap,
};

static struct inode_operations shm_inode_operations = {
	setattr:	shm_setattr,
};

static struct vm_operations_struct shm_vm_ops = {
	open:		shm_open,	/* callback for a new vm-area open */
	close:		shm_close,	/* callback for when the vm-area is released */
	nopage:		shm_nopage,
	swapout:	shm_swapout,
};
size_t shm_ctlmax = SHMMAX;

/* These parameters should be part of the superblock */
static int shm_ctlall;
static int shm_ctlmni;
static int shm_mode;

static int shm_tot; /* total number of shared memory pages */
static int shm_rss; /* number of shared memory pages that are in memory */
static int shm_swp; /* number of shared memory pages that are in swap */
/* locks order:
	pagecache_lock
	shm_lock()/shm_lockall()
	kernel lock
	inode->i_sem
	sem_ids.sem
	mmap_sem

  SMP assumptions:
  - swap_free() never sleeps
  - add_to_swap_cache() never sleeps
  - add_to_swap_cache() doesn't acquire the big kernel lock.
  - shm_unuse() is called with the kernel lock acquired.
 */
/* some statistics */
static ulong swap_attempts;
static ulong swap_successes;
static ulong used_segs;

void __init shm_init (void)
{
	struct vfsmount *res;
	ipc_init_ids(&shm_ids, 1);

	register_filesystem (&shm_fs_type);
	res = kern_mount(&shm_fs_type);
	if (IS_ERR(res)) {
		unregister_filesystem(&shm_fs_type);
		return;
	}
#ifdef CONFIG_PROC_FS
	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
#endif
	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, 1);
	shm_unlock(zero_id);
	INIT_LIST_HEAD(&zshmid_kernel.zero_list);
	zdent = d_alloc_root(get_empty_inode());
	return;
}
static int shm_parse_options(char *options)
{
	int blocks = shm_ctlall;
	int inodes = shm_ctlmni;
	umode_t mode = shm_mode;
	char *this_char, *value;

	this_char = NULL;
	if ( options )
		this_char = strtok(options,",");
	for ( ; this_char; this_char = strtok(NULL,",")) {
		if ((value = strchr(this_char,'=')) != NULL)
			*value++ = 0;
		if (!strcmp(this_char,"nr_blocks")) {
			if (!value || !*value)
				return 1;
			blocks = simple_strtoul(value,&value,0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char,"nr_inodes")) {
			if (!value || !*value)
				return 1;
			inodes = simple_strtoul(value,&value,0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char,"mode")) {
			if (!value || !*value)
				return 1;
			mode = simple_strtoul(value,&value,8);
			if (*value)
				return 1;
		}
		else
			return 1;
	}
	shm_ctlmni = inodes;
	shm_ctlall = blocks;
	shm_mode   = mode;

	return 0;
}
static struct super_block *shm_read_super(struct super_block *s,void *data,
					  int silent)
{
	struct inode * root_inode;

	shm_ctlall = SHMALL;
	shm_ctlmni = SHMMNI;
	shm_mode   = S_IRWXUGO | S_ISVTX;
	if (shm_parse_options (data)) {
		printk(KERN_ERR "shm fs invalid option\n");
		goto out_unlock;
	}

	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = SHM_FS_MAGIC;
	s->s_op = &shm_sops;
	root_inode = iget (s, SEQ_MULTIPLIER);
	if (!root_inode)
		goto out_no_root;
	root_inode->i_op = &shm_root_inode_operations;
	root_inode->i_sb = s;
	root_inode->i_nlink = 2;
	root_inode->i_mode = S_IFDIR | shm_mode;
	s->s_root = d_alloc_root(root_inode);
	if (!s->s_root)
		goto out_no_root;
	shm_sb = s;
	return s;

out_no_root:
	printk(KERN_ERR "shm_read_super: get root inode failed\n");
	iput(root_inode);
out_unlock:
	return NULL;
}
static int shm_remount_fs (struct super_block *sb, int *flags, char *data)
{
	if (shm_parse_options (data))
		return -EINVAL;
	return 0;
}

static inline int shm_checkid(struct shmid_kernel *s, int id)
{
	if (!(s->shm_flags & SHM_SYSV))
		return -EINVAL;
	if (ipc_checkid(&shm_ids,&s->shm_perm,id))
		return -EIDRM;
	return 0;
}

static inline struct shmid_kernel *shm_rmid(int id)
{
	return (struct shmid_kernel *)ipc_rmid(&shm_ids,id);
}

static inline int shm_addid(struct shmid_kernel *shp)
{
	return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);
}
static void shm_put_super(struct super_block *sb)
{
	int i;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	for(i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock (i)))
			continue;
		if (shp->shm_nattch)
			printk(KERN_DEBUG "shm_nattch = %ld\n", shp->shm_nattch);
		shp = shm_rmid(i);
		shm_unlock(i);
		seg_free(shp, 1);
	}
	dput (sb->s_root);
	up(&shm_ids.sem);
}

static int shm_statfs(struct super_block *sb, struct statfs *buf)
{
	buf->f_type = SHM_FS_MAGIC;
	buf->f_bsize = PAGE_SIZE;
	buf->f_blocks = shm_ctlall;
	buf->f_bavail = buf->f_bfree = shm_ctlall - shm_tot;
	buf->f_files = shm_ctlmni;
	buf->f_ffree = shm_ctlmni - used_segs;
	buf->f_namelen = SHM_NAME_LEN;
	return 0;
}
static void shm_write_inode(struct inode * inode)
{
}

static void shm_read_inode(struct inode * inode)
{
	int id;
	struct shmid_kernel *shp;

	id = inode->i_ino;
	inode->i_op = NULL;
	inode->i_mode = 0;
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;

	if (id < SEQ_MULTIPLIER) {
		if (!(shp = shm_lock (id)))
			return;
		inode->i_mode = (shp->shm_flags & S_IALLUGO) | S_IFREG;
		inode->i_uid  = shp->shm_perm.uid;
		inode->i_gid  = shp->shm_perm.gid;
		inode->i_size = shp->shm_segsz;
		shm_unlock (id);
		inode->i_op  = &shm_inode_operations;
		inode->i_fop = &shm_file_operations;
		return;
	}
	inode->i_op    = &shm_root_inode_operations;
	inode->i_fop   = &shm_root_operations;
	inode->i_sb    = shm_sb;
	inode->i_nlink = 2;
	inode->i_mode  = S_IFDIR | shm_mode;
	inode->i_uid   = inode->i_gid = 0;
}
static int shm_create (struct inode *dir, struct dentry *dent, int mode)
{
	int id, err;
	struct inode * inode;

	down(&shm_ids.sem);
	err = id = newseg (IPC_PRIVATE, dent->d_name.name, dent->d_name.len, mode, 0);
	if (err < 0)
		goto out;

	err = -ENOMEM;
	inode = iget (shm_sb, id % SEQ_MULTIPLIER);
	if (!inode)
		goto out;

	err = 0;
	down (&inode->i_sem);
	inode->i_mode = mode | S_IFREG;
	inode->i_op   = &shm_inode_operations;
	d_instantiate(dent, inode);
	up (&inode->i_sem);

out:
	up(&shm_ids.sem);
	return err;
}
static int shm_readdir (struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode * inode = filp->f_dentry->d_inode;
	struct shmid_kernel *shp;
	off_t nr;

	nr = filp->f_pos;

	switch(nr)
	{
	case 0:
		if (filldir(dirent, ".", 1, nr, inode->i_ino) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	case 1:
		if (filldir(dirent, "..", 2, nr, inode->i_ino) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	default:
		down(&shm_ids.sem);
		for (; nr-2 <= shm_ids.max_id; nr++ ) {
			if (nr-2 == zero_id)
				continue;
			if (!(shp = shm_get (nr-2)))
				continue;
			if (shp->shm_flags & SHM_UNLK)
				continue;
			if (filldir(dirent, shp->shm_name, shp->shm_namelen, nr, nr) < 0 )
				break;
		}
		filp->f_pos = nr;
		up(&shm_ids.sem);
		break;
	}

	UPDATE_ATIME(inode);
	return 0;
}
static struct dentry *shm_lookup (struct inode *dir, struct dentry *dent)
{
	int i, err = 0;
	struct shmid_kernel* shp;
	struct inode *inode = NULL;

	if (dent->d_name.len > SHM_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	down(&shm_ids.sem);
	for(i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock(i)))
			continue;
		if (!(shp->shm_flags & SHM_UNLK) &&
		    dent->d_name.len == shp->shm_namelen &&
		    strncmp(dent->d_name.name, shp->shm_name, shp->shm_namelen) == 0)
			goto found;
		shm_unlock(i);
	}

	/*
	 * prevent the reserved names from showing up as negative dentries.
	 * This also prevents object creation through the filesystem
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		err = -EINVAL;	/* EINVAL to give IPC_RMID the right error */

	goto out;

found:
	shm_unlock(i);
	inode = iget(dir->i_sb, i);

	if (!inode)
		err = -EACCES;
out:
	if (err == 0)
		d_add (dent, inode);
	up (&shm_ids.sem);
	return ERR_PTR(err);
}
static int shm_unlink (struct inode *dir, struct dentry *dent)
{
	struct inode * inode = dent->d_inode;
	struct shmid_kernel *shp;

	down (&shm_ids.sem);
	if (!(shp = shm_lock (inode->i_ino)))
		BUG();
	shp->shm_flags |= SHM_UNLK | PRV_DEST;
	shp->shm_perm.key = IPC_PRIVATE; /* Do not find it any more */
	shm_unlock (inode->i_ino);
	up (&shm_ids.sem);
	inode->i_nlink -= 1;
	/*
	 * If it's a reserved name we have to drop the dentry instead
	 * of creating a negative dentry
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		d_drop (dent);
	return 0;
}
/*
 * We cannot use kmalloc for shm_alloc since this restricts the
 * maximum size of the segments.
 *
 * We also cannot use vmalloc, since this uses too much of the vmalloc
 * space and we run out of this on highend machines.
 *
 * So we have to use this complicated indirect scheme to alloc the shm
 * page tables.
 */

#ifdef PTE_INIT
static inline void init_ptes (pte_t *pte, int number) {
	while (number--)
		PTE_INIT (pte++);
}
#else
static inline void init_ptes (pte_t *pte, int number) {
	memset (pte, 0, number*sizeof(*pte));
}
#endif

#define PTES_PER_PAGE (PAGE_SIZE/sizeof(pte_t))
#define SHM_ENTRY(shp, index) (shp)->shm_dir[(index)/PTES_PER_PAGE][(index)%PTES_PER_PAGE]
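/*
 * Worked example (editorial note, not in the original; the numbers
 * assume an i386-style 4 KB PAGE_SIZE and a 4-byte pte_t, so
 * PTES_PER_PAGE is 4096 / 4 = 1024): a 10 MB segment spans 2560
 * pages.  shm_dir then points at two fully used pte pages (entries
 * 0-1023 and 1024-2047) plus one kmalloc'ed tail of 512 ptes, and
 * SHM_ENTRY(shp, 2500) resolves to shm_dir[2][452], since
 * 2500 / 1024 == 2 and 2500 % 1024 == 452.
 */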
static pte_t **shm_alloc(unsigned long pages, int doacc)
{
	unsigned short dir  = pages / PTES_PER_PAGE;
	unsigned short last = pages % PTES_PER_PAGE;
	pte_t **ret, **ptr;

	if (pages == 0)
		return NULL;

	ret = kmalloc ((dir+1) * sizeof(pte_t *), GFP_KERNEL);
	if (!ret)
		goto nomem;

	for (ptr = ret; ptr < ret+dir ; ptr++)
	{
		*ptr = (pte_t *)__get_free_page (GFP_KERNEL);
		if (!*ptr)
			goto free;
		init_ptes (*ptr, PTES_PER_PAGE);
	}

	/* The last one is probably not of PAGE_SIZE: we use kmalloc */
	if (last) {
		*ptr = kmalloc (last*sizeof(pte_t), GFP_KERNEL);
		if (!*ptr)
			goto free;
		init_ptes (*ptr, last);
	}
	if (doacc) {
		shm_lockall();
		shm_tot += pages;
		used_segs++;
		shm_unlockall();
	}

	return ret;

free:
	/* The last failed: we decrement first */
	while (--ptr >= ret)
		free_page ((unsigned long)*ptr);

	kfree (ret);
nomem:
	return ERR_PTR(-ENOMEM);
}
static void shm_free(pte_t** dir, unsigned long pages, int doacc)
{
	int i, rss, swp;
	pte_t **ptr = dir+pages/PTES_PER_PAGE;

	if (!dir)
		return;

	for (i = 0, rss = 0, swp = 0; i < pages ; i++) {
		pte_t pte;
		pte = dir[i/PTES_PER_PAGE][i%PTES_PER_PAGE];
		if (pte_none(pte))
			continue;
		if (pte_present(pte)) {
			__free_page (pte_page(pte));
			rss++;
		} else {
			swap_free(pte_to_swp_entry(pte));
			swp++;
		}
	}

	/* first the last page */
	if (pages%PTES_PER_PAGE)
		kfree (*ptr);
	/* now the whole pages */
	while (--ptr >= dir)
		if (*ptr)
			free_page ((unsigned long)*ptr);

	/* Now the indirect block */
	kfree (dir);

	if (doacc) {
		shm_lockall();
		shm_rss -= rss;
		shm_swp -= swp;
		shm_tot -= pages;
		used_segs--;
		shm_unlockall();
	}
}
static int shm_setattr (struct dentry *dentry, struct iattr *attr)
{
	int error;
	struct inode *inode = dentry->d_inode;
	struct shmid_kernel *shp;
	unsigned long new_pages, old_pages;
	pte_t **new_dir, **old_dir;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;
	if (!(attr->ia_valid & ATTR_SIZE))
		goto set_attr;
	if (attr->ia_size > shm_ctlmax)
		return -EFBIG;

	/* We set old_pages and old_dir for easier cleanup */
	old_pages = new_pages = (attr->ia_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	old_dir = new_dir = shm_alloc(new_pages, 1);
	if (IS_ERR(new_dir))
		return PTR_ERR(new_dir);

	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	error = -ENOSPC;
	if (shm_tot - shp->shm_npages >= shm_ctlall)
		goto size_out;
	error = 0;
	if (shp->shm_segsz == attr->ia_size)
		goto size_out;
	/* Now we set them to the real values */
	old_dir = shp->shm_dir;
	old_pages = shp->shm_npages;
	if (old_dir){
		pte_t *swap;
		int i,j;
		i = old_pages < new_pages ? old_pages : new_pages;
		j = i % PTES_PER_PAGE;
		i /= PTES_PER_PAGE;
		if (j)
			memcpy (new_dir[i], old_dir[i], j * sizeof (pte_t));
		while (i--) {
			swap = new_dir[i];
			new_dir[i] = old_dir[i];
			old_dir[i] = swap;
		}
	}
	shp->shm_dir = new_dir;
	shp->shm_npages = new_pages;
	shp->shm_segsz = attr->ia_size;
size_out:
	shm_unlock(inode->i_ino);
	shm_free (old_dir, old_pages, 1);

set_attr:
	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	if (attr->ia_valid & ATTR_MODE)
		shp->shm_perm.mode = attr->ia_mode;
	if (attr->ia_valid & ATTR_UID)
		shp->shm_perm.uid = attr->ia_uid;
	if (attr->ia_valid & ATTR_GID)
		shp->shm_perm.gid = attr->ia_gid;
	shm_unlock (inode->i_ino);

	inode_setattr(inode, attr);
	return error;
}
static struct shmid_kernel *seg_alloc(int numpages, size_t namelen)
{
	struct shmid_kernel *shp;
	pte_t		    **dir;

	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp) + namelen, GFP_KERNEL);
	if (!shp)
		return ERR_PTR(-ENOMEM);

	dir = shm_alloc (numpages, namelen);
	if (IS_ERR(dir)) {
		kfree(shp);
		return ERR_PTR(PTR_ERR(dir));
	}
	shp->shm_dir    = dir;
	shp->shm_npages = numpages;
	shp->shm_nattch = 0;
	shp->shm_namelen = namelen;
	return(shp);
}

static void seg_free(struct shmid_kernel *shp, int doacc)
{
	shm_free (shp->shm_dir, shp->shm_npages, doacc);
	kfree(shp);
}
static int newseg (key_t key, const char *name, int namelen,
		   int shmflg, size_t size)
{
	struct shmid_kernel *shp;
	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
	int id;

	if (namelen > SHM_NAME_LEN)
		return -ENAMETOOLONG;

	if (size > shm_ctlmax)
		return -EINVAL;

	if (shm_tot + numpages >= shm_ctlall)
		return -ENOSPC;

	shp = seg_alloc(numpages, namelen ? namelen : SHM_FMT_LEN + 1);
	if (IS_ERR(shp))
		return PTR_ERR(shp);
	id = shm_addid(shp);
	if(id == -1) {
		seg_free(shp, 1);
		return -ENOSPC;
	}
	shp->shm_perm.key = key;
	shp->shm_flags = (shmflg & S_IRWXUGO);
	shp->shm_segsz = size;
	shp->shm_cprid = current->pid;
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = CURRENT_TIME;
	shp->id = shm_buildid(id,shp->shm_perm.seq);
	if (namelen != 0) {
		shp->shm_namelen = namelen;
		memcpy (shp->shm_name, name, namelen);
	} else {
		shp->shm_flags |= SHM_SYSV;
		shp->shm_namelen = sprintf (shp->shm_name, SHM_FMT, shp->id);
	}
	shm_unlock(id);

	return shp->id;
}
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
	struct shmid_kernel *shp;
	int err, id = 0;

	if (size < SHMMIN)
		return -EINVAL;

	down(&shm_ids.sem);
	if (key == IPC_PRIVATE) {
		err = newseg(key, NULL, 0, shmflg, size);
	} else if ((id = ipc_findkey(&shm_ids,key)) == -1) {
		if (!(shmflg & IPC_CREAT))
			err = -ENOENT;
		else
			err = newseg(key, NULL, 0, shmflg, size);
	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
		err = -EEXIST;
	} else {
		shp = shm_lock(id);
		if(shp==NULL)
			BUG();
		if (shp->shm_segsz < size)
			err = -EINVAL;
		else if (ipcperms(&shp->shm_perm, shmflg))
			err = -EACCES;
		else
			err = shm_buildid(id, shp->shm_perm.seq);
		shm_unlock(id);
	}
	up(&shm_ids.sem);
	return err;
}
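/*
 * Userspace sketch (editorial illustration, not in the original): the
 * branches above map onto the usual shmget() idioms.  A hedged
 * create-or-attach example on a fixed key (the key 0x1234 is
 * hypothetical):
 *
 *	#include <sys/ipc.h>
 *	#include <sys/shm.h>
 *	#include <errno.h>
 *
 *	int id = shmget(0x1234, 65536, IPC_CREAT | IPC_EXCL | 0600);
 *	if (id == -1 && errno == EEXIST)
 *		id = shmget(0x1234, 65536, 0600);  // attach to existing
 *
 * IPC_PRIVATE always takes the newseg() path above, and
 * IPC_CREAT|IPC_EXCL on an existing key yields -EEXIST.
 */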
static void shm_delete (struct inode *ino)
{
	int shmid = ino->i_ino;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	shp = shm_lock(shmid);
	if(shp==NULL) {
		BUG();
	}
	shp = shm_rmid(shmid);
	shm_unlock(shmid);
	up(&shm_ids.sem);
	seg_free(shp, 1);
	clear_inode(ino);
}
static inline unsigned long copy_shmid_to_user(void *buf, struct shmid64_ds *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shmid_ds out;

		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}
struct shm_setbuf {
	uid_t	uid;
	gid_t	gid;
	mode_t	mode;
};

static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void *buf, int version)
{
	switch(version) {
	case IPC_64:
	    {
		struct shmid64_ds tbuf;

		if (copy_from_user(&tbuf, buf, sizeof(tbuf)))
			return -EFAULT;

		out->uid	= tbuf.shm_perm.uid;
		out->gid	= tbuf.shm_perm.gid;
		out->mode	= tbuf.shm_flags;

		return 0;
	    }
	case IPC_OLD:
	    {
		struct shmid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->uid	= tbuf_old.shm_perm.uid;
		out->gid	= tbuf_old.shm_perm.gid;
		out->mode	= tbuf_old.shm_flags;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}
static inline unsigned long copy_shminfo_to_user(void *buf, struct shminfo64 *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shminfo out;

		if(in->shmmax > INT_MAX)
			out.shmmax = INT_MAX;
		else
			out.shmmax = (int)in->shmmax;

		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}
asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
{
	struct shm_setbuf setbuf;
	struct shmid_kernel *shp;
	int err, version;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);

	switch (cmd) { /* replace with proc interface ? */
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo,0,sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = shm_ctlmni;
		shminfo.shmmax = shm_ctlmax;
		shminfo.shmall = shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if(copy_shminfo_to_user (buf, &shminfo, version))
			return -EFAULT;
		/* reading an integer is always atomic */
		err = shm_ids.max_id;
		if(err<0)
			err = 0;
		return err;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info,0,sizeof(shm_info));
		shm_lockall();
		shm_info.used_ids = shm_ids.in_use;
		shm_info.shm_rss = shm_rss;
		shm_info.shm_tot = shm_tot;
		shm_info.shm_swp = shm_swp;
		shm_info.swap_attempts = swap_attempts;
		shm_info.swap_successes = swap_successes;
		err = shm_ids.max_id;
		shm_unlockall();
		if(copy_to_user (buf, &shm_info, sizeof(shm_info)))
			return -EFAULT;

		return err < 0 ? 0 : err;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		memset(&tbuf, 0, sizeof(tbuf));
		shp = shm_lock(shmid);
		if(shp==NULL)
			return -EINVAL;
		if(cmd==SHM_STAT) {
			err = -EINVAL;
			if (!(shp->shm_flags & SHM_SYSV) ||
			    shmid > shm_ids.max_id)
				goto out_unlock;
			result = shm_buildid(shmid, shp->shm_perm.seq);
		} else {
			err = shm_checkid(shp,shmid);
			if(err)
				goto out_unlock;
			result = 0;
		}
		err=-EACCES;
		if (ipcperms (&shp->shm_perm, S_IRUGO))
			goto out_unlock;
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		/* ugly hack to keep binary compatibility for ipcs */
		tbuf.shm_flags &= PRV_DEST | PRV_LOCKED | S_IRWXUGO;
		if (tbuf.shm_flags & PRV_DEST)
			tbuf.shm_flags |= SHM_DEST;
		if (tbuf.shm_flags & PRV_LOCKED)
			tbuf.shm_flags |= SHM_LOCKED;
		tbuf.shm_flags &= SHM_DEST | SHM_LOCKED | S_IRWXUGO;
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		shm_unlock(shmid);
		if(copy_shmid_to_user (buf, &tbuf, version))
			return -EFAULT;
		return result;
	}
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		/* Allow superuser to lock segment in memory */
		/* Should the pages be faulted in here or leave it to user? */
		/* need to determine interaction with current->swappable */
		if ((shmid % SEQ_MULTIPLIER)== zero_id)
			return -EINVAL;
		if (!capable(CAP_IPC_LOCK))
			return -EPERM;

		shp = shm_lock(shmid);
		if(shp==NULL)
			return -EINVAL;
		err = shm_checkid(shp,shmid);
		if(err)
			goto out_unlock;
		if(cmd==SHM_LOCK)
			shp->shm_flags |= PRV_LOCKED;
		else
			shp->shm_flags &= ~PRV_LOCKED;
		shm_unlock(shmid);
		return err;
	}
	case IPC_RMID:
	{
		/*
		 *	We cannot simply remove the file. The SVID states
		 *	that the block remains until the last person
		 *	detaches from it, then is deleted. A shmat() on
		 *	an RMID segment is legal in older Linux and if
		 *	we change it apps break...
		 *
		 *	Instead we set a destroyed flag, and then blow
		 *	the name away when the usage hits zero.
		 */
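		/*
		 * Illustrative userspace sequence (editorial note, not
		 * in the original) showing the deferred destroy:
		 *
		 *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
		 *	void *p = shmat(id, NULL, 0);
		 *	shmctl(id, IPC_RMID, NULL);	// still mapped, so only
		 *					// PRV_DEST gets set
		 *	memcpy(p, "still usable", 13);	// mapping stays valid
		 *	shmdt(p);			// last detach frees it
		 */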
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		if (shp == NULL) {
			up(&shm_ids.sem);
			return -EINVAL;
		}
		err = shm_checkid(shp, shmid);
		if (err == 0) {
			if (shp->shm_nattch == 0 &&
			    !(shp->shm_flags & SHM_UNLK)) {
				int id=shp->id;
				shm_unlock(shmid);
				up(&shm_ids.sem);
				/*
				 * We can't hold shm_lock here else we
				 * will deadlock in shm_lookup when we
				 * try to recursively grab it.
				 */
				return shm_remove_name(id);
			}
			shp->shm_flags |= PRV_DEST;
			/* Do not find it any more */
			shp->shm_perm.key = IPC_PRIVATE;
		}
		/* Unlock */
		shm_unlock(shmid);
		up(&shm_ids.sem);
		return err;
	}
	case IPC_SET:
	{
		struct dentry * dentry;
		char name[SHM_FMT_LEN+1];

		if ((shmid % SEQ_MULTIPLIER)== zero_id)
			return -EINVAL;

		if(copy_shmid_from_user (&setbuf, buf, version))
			return -EFAULT;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		err=-EINVAL;
		if(shp==NULL)
			goto out_up;
		err = shm_checkid(shp,shmid);
		if(err)
			goto out_unlock_up;
		err=-EPERM;
		if (current->euid != shp->shm_perm.uid &&
		    current->euid != shp->shm_perm.cuid &&
		    !capable(CAP_SYS_ADMIN)) {
			goto out_unlock_up;
		}

		shp->shm_perm.uid = setbuf.uid;
		shp->shm_perm.gid = setbuf.gid;
		shp->shm_flags = (shp->shm_flags & ~S_IRWXUGO)
			| (setbuf.mode & S_IRWXUGO);
		shp->shm_ctim = CURRENT_TIME;
		shm_unlock(shmid);
		up(&shm_ids.sem);

		sprintf (name, SHM_FMT, shmid);
		lock_kernel();
		dentry = lookup_one(name, lock_parent(shm_sb->s_root));
		unlock_dir(shm_sb->s_root);
		err = PTR_ERR(dentry);
		if (IS_ERR(dentry))
			goto bad_dentry;
		err = -ENOENT;
		if (dentry->d_inode) {
			struct inode *ino = dentry->d_inode;
			ino->i_uid   = setbuf.uid;
			ino->i_gid   = setbuf.gid;
			ino->i_mode  = (setbuf.mode & S_IRWXUGO) | (ino->i_mode & ~S_IALLUGO);
			ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
			err = 0;
		}
		dput (dentry);
	bad_dentry:
		unlock_kernel();
		return err;
	}
	default:
		return -EINVAL;
	}

	err = 0;
out_unlock_up:
	shm_unlock(shmid);
out_up:
	up(&shm_ids.sem);
	return err;
out_unlock:
	shm_unlock(shmid);
	return err;
}
static inline void shm_inc (int id) {
	struct shmid_kernel *shp;

	if(!(shp = shm_lock(id)))
		BUG();
	shp->shm_atim = CURRENT_TIME;
	shp->shm_lprid = current->pid;
	shp->shm_nattch++;
	shm_unlock(id);
}

static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
		return -EINVAL;	/* we cannot do private writable mappings */
	UPDATE_ATIME(file->f_dentry->d_inode);
	vma->vm_ops = &shm_vm_ops;
	shm_inc(file->f_dentry->d_inode->i_ino);
	return 0;
}
/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 */
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	unsigned long o_flags;
	int acc_mode;
	struct dentry *dentry;
	char   name[SHM_FMT_LEN+1];

	if (!shm_sb || (shmid % SEQ_MULTIPLIER) == zero_id)
		return -EINVAL;

	if ((addr = (ulong)shmaddr)) {
		if (addr & (SHMLBA-1)) {
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);	   /* round down */
			else
				return -EINVAL;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else
		flags = MAP_SHARED;

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		o_flags = O_RDONLY;
		acc_mode = S_IRUGO;
	} else {
		prot = PROT_READ | PROT_WRITE;
		o_flags = O_RDWR;
		acc_mode = S_IRUGO | S_IWUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	shp = shm_lock(shmid);
	if(shp==NULL)
		return -EINVAL;
	err = ipcperms(&shp->shm_perm, acc_mode);
	shm_unlock(shmid);
	if (err)
		return -EACCES;

	sprintf (name, SHM_FMT, shmid);

	lock_kernel();
	mntget(shm_fs_type.kern_mnt);
	dentry = lookup_one(name, lock_parent(shm_sb->s_root));
	unlock_dir(shm_sb->s_root);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto bad_file;
	err = -ENOENT;
	if (!dentry->d_inode)
		goto bad_file;
	file = dentry_open(dentry, shm_fs_type.kern_mnt, o_flags);
	err = PTR_ERR(file);
	if (IS_ERR (file))
		goto bad_file1;
	down(&current->mm->mmap_sem);
	*raddr = do_mmap (file, addr, file->f_dentry->d_inode->i_size,
			  prot, flags, 0);
	up(&current->mm->mmap_sem);
	unlock_kernel();
	if (IS_ERR(*raddr))
		err = PTR_ERR(*raddr);
	else
		err = 0;
	fput (file);
	return err;

bad_file1:
	dput(dentry);
bad_file:
	mntput(shm_fs_type.kern_mnt);
	unlock_kernel();
	if (err == -ENOENT)
		return -EINVAL;
	return err;
}
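/*
 * Illustrative sketch (editorial note, not in the original): SHM_RND
 * only matters when a non-NULL, unaligned address is passed in.
 * Assuming SHMLBA == 4096 (it is larger on some architectures), a
 * hedged userspace example:
 *
 *	void *p = shmat(id, (void *)0x40001234, SHM_RND);
 *	// 0x40001234 is rounded down to 0x40001000 above; without
 *	// SHM_RND the same call would fail with EINVAL.
 *	...
 *	shmdt(p);	// sys_shmdt() finds the vma whose start matches p
 */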
/* This is called by fork, once for every shm attach. */
static void shm_open (struct vm_area_struct *shmd)
{
	shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino);
}
/*
 * Remove a name.
 */
static int shm_remove_name(int id)
{
	struct dentry *dir;
	struct dentry *dentry;
	int error;
	char name[SHM_FMT_LEN+1];

	sprintf (name, SHM_FMT, id);
	lock_kernel();
	dir = lock_parent(shm_sb->s_root);
	dentry = lookup_one(name, dir);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/*
		 * We have to do our own unlink to prevent the vfs
		 * permission check. The SYSV IPC layer has already
		 * checked the permissions, which do not comply with
		 * the vfs rules.
		 */
		struct inode *inode = dir->d_inode;
		down(&inode->i_zombie);
		error = shm_unlink(inode, dentry);
		if (!error)
			d_delete(dentry);
		up(&inode->i_zombie);
		dput(dentry);
	}
	unlock_dir(dir);
	unlock_kernel();
	return error;
}
/*
 * remove the attach descriptor shmd.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close (struct vm_area_struct *shmd)
{
	int id = shmd->vm_file->f_dentry->d_inode->i_ino;
	struct shmid_kernel *shp;

	/* remove from the list of attaches of the shm segment */
	if(!(shp = shm_lock(id)))
		BUG();
	shp->shm_lprid = current->pid;
	shp->shm_dtim = CURRENT_TIME;
	shp->shm_nattch--;
	if(shp->shm_nattch == 0 &&
	   shp->shm_flags & PRV_DEST &&
	   !(shp->shm_flags & SHM_UNLK)) {
		int pid=shp->id;
		int err;
		shm_unlock(id);

		/* The kernel lock prevents new attaches from
		 * happening. We can't hold shm_lock here
		 * else we will deadlock in shm_lookup when we
		 * try to recursively grab it.
		 */
		err = shm_remove_name(pid);
		if(err && err != -EINVAL && err != -ENOENT)
			printk(KERN_ERR "Unlink of SHM id %d failed (%d).\n", pid, err);

	} else {
		shm_unlock(id);
	}
}
/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
asmlinkage long sys_shmdt (char *shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *shmd, *shmdnext;

	down(&mm->mmap_sem);
	for (shmd = mm->mmap; shmd; shmd = shmdnext) {
		shmdnext = shmd->vm_next;
		if (shmd->vm_ops == &shm_vm_ops
		    && shmd->vm_start - (shmd->vm_pgoff << PAGE_SHIFT) == (ulong) shmaddr)
			do_munmap(mm, shmd->vm_start, shmd->vm_end - shmd->vm_start);
	}
	up(&mm->mmap_sem);
	return 0;
}
/*
 * Enter the shm page into the SHM data structures.
 *
 * The way "nopage" is done, we don't actually have to
 * do anything here: nopage will have filled in the shm
 * data structures already, and shm_swap() will just
 * work off them..
 */
static int shm_swapout(struct page * page, struct file *file)
{
	return 0;
}
/*
 * page not present ... go through shm_dir
 */
static struct page * shm_nopage_core(struct shmid_kernel *shp, unsigned int idx, int *swp, int *rss, unsigned long address)
{
	pte_t pte;
	struct page * page;

	if (idx >= shp->shm_npages)
		return NOPAGE_SIGBUS;

	pte = SHM_ENTRY(shp,idx);
	if (!pte_present(pte)) {
		/* page not present so shm_swap can't race with us
		   and the semaphore protects us from other tasks that
		   could potentially fault on our pte under us */
		if (pte_none(pte)) {
			shm_unlock(shp->id);
			page = page_cache_alloc();
			if (!page)
				goto oom;
			clear_user_highpage(page, address);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
		} else {
			swp_entry_t entry = pte_to_swp_entry(pte);

			shm_unlock(shp->id);
			page = lookup_swap_cache(entry);
			if (!page) {
				lock_kernel();
				swapin_readahead(entry);
				page = read_swap_cache(entry);
				unlock_kernel();
				if (!page)
					goto oom;
			}
			delete_from_swap_cache(page);
			page = replace_with_highmem(page);
			swap_free(entry);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
			(*swp)--;
		}
		(*rss)++;
		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
		SHM_ENTRY(shp, idx) = pte;
	}

	/* pte_val(pte) == SHM_ENTRY (shp, idx) */
	page_cache_get(pte_page(pte));
	return pte_page(pte);

oom:
	shm_lock(shp->id);
	return NOPAGE_OOM;
}
static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page * page;
	struct shmid_kernel *shp;
	unsigned int idx;
	struct inode * inode = shmd->vm_file->f_dentry->d_inode;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	down(&inode->i_sem);
	if(!(shp = shm_lock(inode->i_ino)))
		BUG();
	page = shm_nopage_core(shp, idx, &shm_swp, &shm_rss, address);
	shm_unlock(inode->i_ino);
	up(&inode->i_sem);
	return(page);
}
#define OKAY	0
#define RETRY	1
#define FAILED	2

static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage)
{
	pte_t page;
	struct page *page_map;

	page = SHM_ENTRY(shp, idx);
	if (!pte_present(page))
		return RETRY;
	page_map = pte_page(page);
	if (page_map->zone->free_pages > page_map->zone->pages_high)
		return RETRY;
	if (shp->id != zero_id) swap_attempts++;

	if (--(*counter) < 0) /* failed */
		return FAILED;
	if (page_count(page_map) != 1)
		return RETRY;

	lock_page(page_map);
	if (!(page_map = prepare_highmem_swapout(page_map)))
		return FAILED;
	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);

	/* add the locked page to the swap cache before allowing
	   the swapin path to run lookup_swap_cache(). This avoids
	   reading a not yet uptodate block from disk.
	   NOTE: we just accounted the swap space reference for this
	   swap cache page at __get_swap_page() time. */
	add_to_swap_cache(*outpage = page_map, swap_entry);
	return OKAY;
}
static void shm_swap_postop(struct page *page)
{
	lock_kernel();
	rw_swap_page(WRITE, page, 0);
	unlock_kernel();
	page_cache_release(page);
}

static int shm_swap_preop(swp_entry_t *swap_entry)
{
	lock_kernel();
	/* subtle: preload the swap count for the swap cache. We can't
	   increase the count inside the critical section as we can't release
	   the shm_lock there. And we can't acquire the big lock with the
	   shm_lock held (otherwise we would deadlock too easily). */
	*swap_entry = __get_swap_page(2);
	if (!(*swap_entry).val) {
		unlock_kernel();
		return 1;
	}
	unlock_kernel();
	return 0;
}
/*
 * Goes through counter = (shm_rss / (prio + 1)) present shm pages.
 */
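/*
 * Worked example (editorial note, not in the original): with
 * shm_rss == 1200 resident shm pages, an aggressive scan at prio 0
 * may visit up to 1200 / 1 == 1200 pages, while a gentle scan at
 * prio 5 stops after 1200 / 6 == 200; lower prio values therefore
 * scan harder.
 */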
static unsigned long swap_id; /* currently being swapped */
static unsigned long swap_idx; /* next to swap */

int shm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long id, idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	zshm_swap(prio, gfp_mask);
	counter = shm_rss / (prio + 1);
	if (!counter)
		return 0;
	if (shm_swap_preop(&swap_entry))
		return 0;

	shm_lockall();
check_id:
	shp = shm_get(swap_id);
	if(shp==NULL || shp->shm_flags & PRV_LOCKED) {
next_id:
		swap_idx = 0;
		if (++swap_id > shm_ids.max_id) {
			swap_id = 0;
			if (loop) {
failed:
				shm_unlockall();
				__swap_free(swap_entry, 2);
				return 0;
			}
			loop = 1;
		}
		goto check_id;
	}
	id = swap_id;

check_table:
	idx = swap_idx++;
	if (idx >= shp->shm_npages)
		goto next_id;

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	swap_successes++;
	shm_swp++;
	shm_rss--;
	shm_unlockall();

	shm_swap_postop(page_map);
	return 1;
}
/*
 * Free the swap entry and set the new pte for the shm page.
 */
static void shm_unuse_page(struct shmid_kernel *shp, unsigned long idx,
			   swp_entry_t entry, struct page *page)
{
	pte_t pte;

	pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
	SHM_ENTRY(shp, idx) = pte;
	page_cache_get(page);
	shm_rss++;

	shm_swp--;

	swap_free(entry);
}

static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
{
	int n;

	for (n = 0; n < shp->shm_npages; n++) {
		if (pte_none(SHM_ENTRY(shp,n)))
			continue;
		if (pte_present(SHM_ENTRY(shp,n)))
			continue;
		if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
			shm_unuse_page(shp, n, entry, page);
			return 1;
		}
	}
	return 0;
}
/*
 * shm_unuse() searches for a possibly swapped-out shm page.
 */
void shm_unuse(swp_entry_t entry, struct page *page)
{
	int i;

	shm_lockall();
	for (i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel *shp = shm_get(i);
		if(shp==NULL)
			continue;
		if (shm_unuse_core(shp, entry, page))
			goto out;
	}
out:
	shm_unlockall();
	zmap_unuse(entry, page);
}
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
{
	off_t pos = 0;
	off_t begin = 0;
	int i, len = 0;

	down(&shm_ids.sem);
	len += sprintf(buffer, "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime name\n");

	for(i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel* shp;

		if (i == zero_id)
			continue;
		shp = shm_lock(i);
		if(shp!=NULL) {
#define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
#define BIG_STRING   "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
			char *format;

			if (sizeof(size_t) <= sizeof(int))
				format = SMALL_STRING;
			else
				format = BIG_STRING;
			len += sprintf(buffer + len, format,
				shp->shm_perm.key,
				shm_buildid(i, shp->shm_perm.seq),
				shp->shm_flags,
				shp->shm_segsz,
				shp->shm_cprid,
				shp->shm_lprid,
				shp->shm_nattch,
				shp->shm_perm.uid,
				shp->shm_perm.gid,
				shp->shm_perm.cuid,
				shp->shm_perm.cgid,
				shp->shm_atim,
				shp->shm_dtim,
				shp->shm_ctim,
				shp->shm_namelen,
				shp->shm_name,
				shp->shm_flags & SHM_UNLK ? " (deleted)" : "");
			shm_unlock(i);
		}
		pos += len;
		if(pos < offset) {
			len = 0;
			begin = pos;
		}
		if(pos > offset + length)
			goto done;
	}
	*eof = 1;

done:
	up(&shm_ids.sem);
	*start = buffer + (offset - begin);
	len -= (offset - begin);
	if(len > length)
		len = length;
	if(len < 0)
		len = 0;
	return len;
}
#endif
#define VMA_TO_SHP(vma)		((vma)->vm_file->private_data)

static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
static unsigned long zswap_idx; /* next to swap */
static struct shmid_kernel *zswap_shp = &zshmid_kernel;
static int zshm_rss;

static struct vm_operations_struct shmzero_vm_ops = {
	open:		shmzero_open,
	close:		shmzero_close,
	nopage:		shmzero_nopage,
	swapout:	shm_swapout,
};
/*
 * In this implementation, the "unuse" and "swapout" interfaces are
 * interlocked via the kernel_lock, as well as shm_lock(zero_id).
 * "unuse" and "nopage/swapin", as well as "swapout" and "nopage/swapin"
 * interlock via shm_lock(zero_id). All these interlocks can be based
 * on a per mapping lock instead of being a global lock.
 */
/*
 * Reference (existence) counting on the file/dentry/inode is done
 * by generic vm_file code. The zero code does not hold any reference
 * on the pseudo-file. This is possible because the open/close calls
 * are bracketed by the file count update calls.
 */
static struct file *file_setup(struct file *fzero, struct shmid_kernel *shp)
{
	struct file *filp;
	struct inode *inp;

	if ((filp = get_empty_filp()) == 0)
		return(filp);
	if ((inp = get_empty_inode()) == 0) {
		put_filp(filp);
		return(0);
	}
	if ((filp->f_dentry = d_alloc(zdent, &(const struct qstr) { "dev/zero",
					8, 0 })) == 0) {
		iput(inp);
		put_filp(filp);
		return(0);
	}
	filp->f_vfsmnt = mntget(shm_fs_type.kern_mnt);
	d_instantiate(filp->f_dentry, inp);

	/*
	 * Copy over dev/ino for benefit of procfs. Use
	 * ino to indicate separate mappings.
	 */
	filp->f_dentry->d_inode->i_dev = shm_fs_type.kern_mnt->mnt_sb->s_dev;
	filp->f_dentry->d_inode->i_ino = (unsigned long)shp;
	if (fzero)
		fput(fzero);	/* release /dev/zero file */
	return(filp);
}
int map_zero_setup(struct vm_area_struct *vma)
{
	extern int vm_enough_memory(long pages);
	struct shmid_kernel *shp;
	struct file *filp;

	if (!vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))
		return -ENOMEM;
	if (IS_ERR(shp = seg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE, 0)))
		return PTR_ERR(shp);
	if ((filp = file_setup(vma->vm_file, shp)) == 0) {
		seg_free(shp, 0);
		return -ENOMEM;
	}
	vma->vm_file = filp;
	VMA_TO_SHP(vma) = (void *)shp;
	shp->id = zero_id;
	init_MUTEX(&shp->zsem);
	vma->vm_ops = &shmzero_vm_ops;
	shmzero_open(vma);
	spin_lock(&zmap_list_lock);
	list_add(&shp->zero_list, &zshmid_kernel.zero_list);
	spin_unlock(&zmap_list_lock);
	return 0;
}
static void shmzero_open(struct vm_area_struct *shmd)
{
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shp->shm_nattch++;
	up(&shp->zsem);
}

static void shmzero_close(struct vm_area_struct *shmd)
{
	int done = 0;
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	if (--shp->shm_nattch == 0)
		done = 1;
	up(&shp->zsem);
	if (done) {
		spin_lock(&zmap_list_lock);
		if (shp == zswap_shp)
			zswap_shp = list_entry(zswap_shp->zero_list.next,
						struct shmid_kernel, zero_list);
		list_del(&shp->zero_list);
		spin_unlock(&zmap_list_lock);
		seg_free(shp, 0);
	}
}
static struct page * shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page *page;
	struct shmid_kernel *shp;
	unsigned int idx;
	int dummy;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shm_lock(zero_id);
	page = shm_nopage_core(shp, idx, &dummy, &zshm_rss, address);
	shm_unlock(zero_id);
	up(&shp->zsem);
	return(page);
}
static void zmap_unuse(swp_entry_t entry, struct page *page)
{
	struct shmid_kernel *shp;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	for (shp = list_entry(zshmid_kernel.zero_list.next, struct shmid_kernel,
			zero_list); shp != &zshmid_kernel;
			shp = list_entry(shp->zero_list.next, struct shmid_kernel,
								zero_list)) {
		if (shm_unuse_core(shp, entry, page))
			break;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);
}
static void zshm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	counter = zshm_rss / (prio + 1);
	if (!counter)
		return;
next:
	if (shm_swap_preop(&swap_entry))
		return;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	if (zshmid_kernel.zero_list.next == 0)
		goto failed;
next_id:
	if (zswap_shp == &zshmid_kernel) {
		if (loop) {
failed:
			shm_unlock(zero_id);
			spin_unlock(&zmap_list_lock);
			__swap_free(swap_entry, 2);
			return;
		}
		zswap_shp = list_entry(zshmid_kernel.zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		loop = 1;
	}
	shp = zswap_shp;

check_table:
	idx = zswap_idx++;
	if (idx >= shp->shm_npages) {
		zswap_shp = list_entry(zswap_shp->zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		goto next_id;
	}

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);

	shm_swap_postop(page_map);
	if (counter)
		goto next;
	return;
}