/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
 *	Christoph Rohland <hans-christoph.rohland@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * make it a file system, Christoph Rohland <hans-christoph.rohland@sap.com>
 *
 * The filesystem has the following restrictions/bugs:
 * 1) It can only handle one directory.
 * 2) Private writeable mappings are not supported
 * 3) Read and write are not implemented (should they be?)
 * 4) No special nodes are supported
 *
 * There are the following mount options:
 * - nr_blocks (^= shmall) is the number of blocks of size PAGE_SIZE
 *   we are allowed to allocate
 * - nr_inodes (^= shmmni) is the number of files we are allowed to
 *   allocate
 * - mode is the mode for the root directory (default S_IRWXUGO | S_ISVTX)
 */
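
/*
 * Purely illustrative mount line (not taken from this file); the option
 * names are the ones parsed by shm_parse_options() below:
 *
 *	mount -t shm -o nr_blocks=1024,nr_inodes=128,mode=1777 shm /shm
 *
 * would cap the fs at 1024 PAGE_SIZE blocks and 128 segments and give
 * the root directory a world-writable, sticky mode.
 */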
#include <linux/config.h>
#include <linux/module.h>
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/locks.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "util.h"
static struct super_block *shm_read_super(struct super_block *,void *, int);
static void shm_put_super (struct super_block *);
static int shm_remount_fs (struct super_block *, int *, char *);
static void shm_read_inode (struct inode *);
static int shm_statfs (struct super_block *, struct statfs *);
static int shm_create (struct inode *,struct dentry *,int);
static struct dentry *shm_lookup (struct inode *,struct dentry *);
static int shm_unlink (struct inode *,struct dentry *);
static int shm_setattr (struct dentry *dent, struct iattr *attr);
static void shm_delete (struct inode *);
static int shm_mmap (struct file *, struct vm_area_struct *);
static int shm_readdir (struct file *, void *, filldir_t);

#define SHM_NAME_LEN NAME_MAX
#define SHM_FMT ".IPC_%08x"
#define SHM_FMT_LEN 13
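/* SHM_FMT_LEN is strlen(".IPC_") + 8 hex digits = 5 + 8 = 13, so e.g.
 * the SYSV segment with id 0x1234 appears as the file ".IPC_00001234". */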

/* shm_mode upper byte flags */
/* SHM_DEST and SHM_LOCKED are used in ipcs(8) */
#define PRV_DEST	0010000	/* segment will be destroyed on last detach */
#define PRV_LOCKED	0020000	/* segment will not be swapped */
#define SHM_UNLK	0040000	/* filename is unlinked */
#define SHM_SYSV	0100000	/* It is a SYSV shm segment */

struct shmid_kernel /* private to the kernel */
{
	struct kern_ipc_perm	shm_perm;
	size_t			shm_segsz;
	unsigned long		shm_nattch;
	unsigned long		shm_npages;	/* size of segment (pages) */
	pte_t			**shm_dir;	/* ptr to arr of ptrs to frames */
	int			id;
	union permap {
		struct shmem {
			time_t	atime;
			time_t	dtime;
			time_t	ctime;
			pid_t	cpid;
			pid_t	lpid;
			int	nlen;
			char	nm[0];
		} shmem;
		struct zero {
			struct semaphore	sema;
			struct list_head	list;
		} zero;
	} permap;
};

#define shm_atim	permap.shmem.atime
#define shm_dtim	permap.shmem.dtime
#define shm_ctim	permap.shmem.ctime
#define shm_cprid	permap.shmem.cpid
#define shm_lprid	permap.shmem.lpid
#define shm_namelen	permap.shmem.nlen
#define shm_name	permap.shmem.nm
#define shm_flags	shm_perm.mode
#define zsem		permap.zero.sema
#define zero_list	permap.zero.list

static struct ipc_ids shm_ids;

#define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
#define shm_unlock(id)	ipc_unlock(&shm_ids,id)
#define shm_lockall()	ipc_lockall(&shm_ids)
#define shm_unlockall()	ipc_unlockall(&shm_ids)
#define shm_get(id)	((struct shmid_kernel*)ipc_get(&shm_ids,id))
#define shm_buildid(id, seq) \
	ipc_buildid(&shm_ids, id, seq)

static int newseg (key_t key, const char *name, int namelen, int shmflg, size_t size);
static void seg_free(struct shmid_kernel *shp, int doacc);
static void shm_open (struct vm_area_struct *shmd);
static void shm_close (struct vm_area_struct *shmd);
static int shm_remove_name(int id);
static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
static int shm_swapout(struct page *, struct file *);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
#endif

static void zshm_swap (int prio, int gfp_mask);
static void zmap_unuse(swp_entry_t entry, struct page *page);
static void shmzero_open(struct vm_area_struct *shmd);
static void shmzero_close(struct vm_area_struct *shmd);
static struct page *shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share);
static int zero_id;
static struct shmid_kernel zshmid_kernel;
static struct dentry *zdent;

#define SHM_FS_MAGIC 0x02011994

static struct super_block * shm_sb;

static DECLARE_FSTYPE(shm_fs_type, "shm", shm_read_super, FS_SINGLE);

static struct super_operations shm_sops = {
	read_inode:	shm_read_inode,
	delete_inode:	shm_delete,
	put_super:	shm_put_super,
	statfs:		shm_statfs,
	remount_fs:	shm_remount_fs,
};

static struct file_operations shm_root_operations = {
	readdir:	shm_readdir,
};

static struct inode_operations shm_root_inode_operations = {
	create:		shm_create,
	lookup:		shm_lookup,
	unlink:		shm_unlink,
};

static struct file_operations shm_file_operations = {
	mmap:		shm_mmap,
};

static struct inode_operations shm_inode_operations = {
	setattr:	shm_setattr,
};

static struct vm_operations_struct shm_vm_ops = {
	open:	shm_open,	/* callback for a new vm-area open */
	close:	shm_close,	/* callback for when the vm-area is released */
	nopage:	shm_nopage,
	swapout:shm_swapout,
};

size_t shm_ctlmax = SHMMAX;

/* These parameters should be part of the superblock */
static int shm_ctlall;
static int shm_ctlmni;
static int shm_mode;

static int shm_tot; /* total number of shared memory pages */
static int shm_rss; /* number of shared memory pages that are in memory */
static int shm_swp; /* number of shared memory pages that are in swap */

/* locks order:
	pagecache_lock
	shm_lock()/shm_lockall()
	kernel lock
	inode->i_sem
	sem_ids.sem
	mmap_sem

   SMP assumptions:
   - swap_free() never sleeps
   - add_to_swap_cache() never sleeps
   - add_to_swap_cache() doesn't acquire the big kernel lock.
   - shm_unuse() is called with the kernel lock acquired.
 */

/* some statistics */
static ulong swap_attempts;
static ulong swap_successes;
static ulong used_segs;

void __init shm_init (void)
{
	struct vfsmount *res;
	ipc_init_ids(&shm_ids, 1);

	register_filesystem (&shm_fs_type);
	res = kern_mount(&shm_fs_type);
	if (IS_ERR(res)) {
		unregister_filesystem(&shm_fs_type);
		return;
	}
#ifdef CONFIG_PROC_FS
	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
#endif
	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, 1);
	shm_unlock(zero_id);
	INIT_LIST_HEAD(&zshmid_kernel.zero_list);
	zdent = d_alloc_root(get_empty_inode());
	return;
}

static int shm_parse_options(char *options)
{
	int blocks = shm_ctlall;
	int inodes = shm_ctlmni;
	umode_t mode = shm_mode;
	char *this_char, *value;

	this_char = NULL;
	if ( options )
		this_char = strtok(options,",");
	for ( ; this_char; this_char = strtok(NULL,",")) {
		if ((value = strchr(this_char,'=')) != NULL)
			*value++ = 0;
		if (!strcmp(this_char,"nr_blocks")) {
			if (!value || !*value)
				return 1;
			blocks = simple_strtoul(value,&value,0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char,"nr_inodes")) {
			if (!value || !*value)
				return 1;
			inodes = simple_strtoul(value,&value,0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char,"mode")) {
			if (!value || !*value)
				return 1;
			mode = simple_strtoul(value,&value,8);
			if (*value)
				return 1;
		}
		else
			return 1;
	}
	shm_ctlmni = inodes;
	shm_ctlall = blocks;
	shm_mode   = mode;

	return 0;
}

static struct super_block *shm_read_super(struct super_block *s,void *data,
					  int silent)
{
	struct inode * root_inode;

	shm_ctlall = SHMALL;
	shm_ctlmni = SHMMNI;
	shm_mode   = S_IRWXUGO | S_ISVTX;
	if (shm_parse_options (data)) {
		printk(KERN_ERR "shm fs invalid option\n");
		goto out_unlock;
	}

	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = SHM_FS_MAGIC;
	s->s_op = &shm_sops;
	root_inode = iget (s, SEQ_MULTIPLIER);
	if (!root_inode)
		goto out_no_root;
	root_inode->i_op = &shm_root_inode_operations;
	root_inode->i_sb = s;
	root_inode->i_nlink = 2;
	root_inode->i_mode = S_IFDIR | shm_mode;
	s->s_root = d_alloc_root(root_inode);
	if (!s->s_root)
		goto out_no_root;
	shm_sb = s;
	return s;

out_no_root:
	printk(KERN_ERR "shm_read_super: get root inode failed\n");
	iput(root_inode);
out_unlock:
	return NULL;
}

static int shm_remount_fs (struct super_block *sb, int *flags, char *data)
{
	if (shm_parse_options (data))
		return -EINVAL;
	return 0;
}

static inline int shm_checkid(struct shmid_kernel *s, int id)
{
	if (!(s->shm_flags & SHM_SYSV))
		return -EINVAL;
	if (ipc_checkid(&shm_ids,&s->shm_perm,id))
		return -EIDRM;
	return 0;
}

static inline struct shmid_kernel *shm_rmid(int id)
{
	return (struct shmid_kernel *)ipc_rmid(&shm_ids,id);
}

static inline int shm_addid(struct shmid_kernel *shp)
{
	return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);
}

static void shm_put_super(struct super_block *sb)
{
	int i;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	for(i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock (i)))
			continue;
		if (shp->shm_nattch)
			printk(KERN_DEBUG "shm_nattch = %ld\n", shp->shm_nattch);
		shp = shm_rmid(i);
		shm_unlock(i);
		seg_free(shp, 1);
	}
	dput (sb->s_root);
	up(&shm_ids.sem);
}

static int shm_statfs(struct super_block *sb, struct statfs *buf)
{
	buf->f_type = SHM_FS_MAGIC;
	buf->f_bsize = PAGE_SIZE;
	buf->f_blocks = shm_ctlall;
	buf->f_bavail = buf->f_bfree = shm_ctlall - shm_tot;
	buf->f_files = shm_ctlmni;
	buf->f_ffree = shm_ctlmni - used_segs;
	buf->f_namelen = SHM_NAME_LEN;
	return 0;
}

static void shm_read_inode(struct inode * inode)
{
	int id;
	struct shmid_kernel *shp;

	id = inode->i_ino;
	inode->i_op = NULL;
	inode->i_mode = 0;
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;

	if (id < SEQ_MULTIPLIER) {
		if (!(shp = shm_lock (id)))
			return;
		inode->i_mode = (shp->shm_flags & S_IALLUGO) | S_IFREG;
		inode->i_uid  = shp->shm_perm.uid;
		inode->i_gid  = shp->shm_perm.gid;
		inode->i_size = shp->shm_segsz;
		shm_unlock (id);
		inode->i_op  = &shm_inode_operations;
		inode->i_fop = &shm_file_operations;
		return;
	}

	inode->i_op    = &shm_root_inode_operations;
	inode->i_fop   = &shm_root_operations;
	inode->i_sb    = shm_sb;
	inode->i_nlink = 2;
	inode->i_mode  = S_IFDIR | shm_mode;
	inode->i_uid   = inode->i_gid = 0;
}

static int shm_create (struct inode *dir, struct dentry *dent, int mode)
{
	int id, err;
	struct inode * inode;

	down(&shm_ids.sem);
	err = id = newseg (IPC_PRIVATE, dent->d_name.name, dent->d_name.len, mode, 0);
	if (err < 0)
		goto out;

	err = -ENOMEM;
	inode = iget (shm_sb, id % SEQ_MULTIPLIER);
	if (!inode)
		goto out;

	err = 0;
	down (&inode->i_sem);
	inode->i_mode = mode | S_IFREG;
	inode->i_op = &shm_inode_operations;
	d_instantiate(dent, inode);
	up (&inode->i_sem);

out:
	up(&shm_ids.sem);
	return err;
}

static int shm_readdir (struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode * inode = filp->f_dentry->d_inode;
	struct shmid_kernel *shp;
	off_t nr;

	nr = filp->f_pos;

	switch(nr)
	{
	case 0:
		if (filldir(dirent, ".", 1, nr, inode->i_ino, DT_DIR) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	case 1:
		if (filldir(dirent, "..", 2, nr, inode->i_ino, DT_DIR) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	default:
		down(&shm_ids.sem);
		for (; nr-2 <= shm_ids.max_id; nr++ ) {
			if (nr-2 == zero_id)
				continue;
			if (!(shp = shm_get (nr-2)))
				continue;
			if (shp->shm_flags & SHM_UNLK)
				continue;
			if (filldir(dirent, shp->shm_name, shp->shm_namelen, nr, nr, DT_REG) < 0 )
				break;
		}
		filp->f_pos = nr;
		up(&shm_ids.sem);
		break;
	}

	UPDATE_ATIME(inode);
	return 0;
}

static struct dentry *shm_lookup (struct inode *dir, struct dentry *dent)
{
	int i, err = 0;
	struct shmid_kernel* shp;
	struct inode *inode = NULL;

	if (dent->d_name.len > SHM_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	down(&shm_ids.sem);
	for(i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock(i)))
			continue;
		if (!(shp->shm_flags & SHM_UNLK) &&
		    dent->d_name.len == shp->shm_namelen &&
		    strncmp(dent->d_name.name, shp->shm_name, shp->shm_namelen) == 0)
			goto found;
		shm_unlock(i);
	}

	/*
	 * Prevent reserved names from becoming negative dentries.
	 * This also prevents object creation through the filesystem.
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		err = -EINVAL;	/* EINVAL to give IPC_RMID the right error */

	goto out;

found:
	shm_unlock(i);
	inode = iget(dir->i_sb, i);

	if (!inode)
		err = -EACCES;
out:
	if (err == 0)
		d_add (dent, inode);
	up (&shm_ids.sem);
	return ERR_PTR(err);
}

static int shm_unlink (struct inode *dir, struct dentry *dent)
{
	struct inode * inode = dent->d_inode;
	struct shmid_kernel *shp;

	down (&shm_ids.sem);
	if (!(shp = shm_lock (inode->i_ino)))
		BUG();
	shp->shm_flags |= SHM_UNLK | PRV_DEST;
	shp->shm_perm.key = IPC_PRIVATE; /* Do not find it any more */
	shm_unlock (inode->i_ino);
	up (&shm_ids.sem);
	inode->i_nlink -= 1;
	/*
	 * If it's a reserved name we have to drop the dentry instead
	 * of creating a negative dentry
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		d_drop (dent);
	return 0;
}

/*
 * We cannot use kmalloc for shm_alloc since this restricts the
 * maximum size of the segments.
 *
 * We also cannot use vmalloc, since this uses too much of the vmalloc
 * space and we run out of this on highend machines.
 *
 * So we have to use this complicated indirect scheme to alloc the shm
 * page tables.
 */

#ifdef PTE_INIT
static inline void init_ptes (pte_t *pte, int number) {
	while (number--)
		PTE_INIT (pte++);
}
#else
static inline void init_ptes (pte_t *pte, int number) {
	memset (pte, 0, number*sizeof(*pte));
}
#endif

#define PTES_PER_PAGE (PAGE_SIZE/sizeof(pte_t))
#define SHM_ENTRY(shp, index) (shp)->shm_dir[(index)/PTES_PER_PAGE][(index)%PTES_PER_PAGE]
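/*
 * Worked example (assuming 4 kB pages and a 4-byte pte_t, so
 * PTES_PER_PAGE == 1024): page index 2500 of a segment lives in
 * shm_dir[2][452], since 2500/1024 == 2 and 2500%1024 == 452.
 */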

static pte_t **shm_alloc(unsigned long pages, int doacc)
{
	unsigned short dir  = pages / PTES_PER_PAGE;
	unsigned short last = pages % PTES_PER_PAGE;
	pte_t **ret, **ptr;

	if (pages == 0)
		return NULL;

	ret = kmalloc ((dir+1) * sizeof(pte_t *), GFP_KERNEL);
	if (!ret)
		goto nomem;

	for (ptr = ret; ptr < ret+dir ; ptr++)
	{
		*ptr = (pte_t *)__get_free_page (GFP_KERNEL);
		if (!*ptr)
			goto free;
		init_ptes (*ptr, PTES_PER_PAGE);
	}

	/* The last one is probably not of PAGE_SIZE: we use kmalloc */
	if (last) {
		*ptr = kmalloc (last*sizeof(pte_t), GFP_KERNEL);
		if (!*ptr)
			goto free;
		init_ptes (*ptr, last);
	}
	if (doacc) {
		shm_lockall();
		shm_tot += pages;
		used_segs++;
		shm_unlockall();
	}

	return ret;

free:
	/* The last failed: we decrement first */
	while (--ptr >= ret)
		free_page ((unsigned long)*ptr);

	kfree (ret);
nomem:
	return ERR_PTR(-ENOMEM);
}

static void shm_free(pte_t** dir, unsigned long pages, int doacc)
{
	int i, rss, swp;
	pte_t **ptr = dir+pages/PTES_PER_PAGE;

	if (!dir)
		return;

	for (i = 0, rss = 0, swp = 0; i < pages ; i++) {
		pte_t pte;
		pte = dir[i/PTES_PER_PAGE][i%PTES_PER_PAGE];
		if (pte_none(pte))
			continue;
		if (pte_present(pte)) {
			__free_page (pte_page(pte));
			rss++;
		} else {
			swap_free(pte_to_swp_entry(pte));
			swp++;
		}
	}

	/* first the last page */
	if (pages%PTES_PER_PAGE)
		kfree (*ptr);
	/* now the whole pages */
	while (--ptr >= dir)
		if (*ptr)
			free_page ((unsigned long)*ptr);

	/* Now the indirect block */
	kfree (dir);

	if (doacc) {
		shm_lockall();
		shm_rss -= rss;
		shm_swp -= swp;
		shm_tot -= pages;
		used_segs--;
		shm_unlockall();
	}
}

static int shm_setattr (struct dentry *dentry, struct iattr *attr)
{
	int error;
	struct inode *inode = dentry->d_inode;
	struct shmid_kernel *shp;
	unsigned long new_pages, old_pages;
	pte_t **new_dir, **old_dir;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;
	if (!(attr->ia_valid & ATTR_SIZE))
		goto set_attr;
	if (attr->ia_size > shm_ctlmax)
		return -EFBIG;

	/* We set old_pages and old_dir for easier cleanup */
	old_pages = new_pages = (attr->ia_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	old_dir = new_dir = shm_alloc(new_pages, 1);
	if (IS_ERR(new_dir))
		return PTR_ERR(new_dir);

	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	error = -ENOSPC;
	if (shm_tot - shp->shm_npages >= shm_ctlall)
		goto size_out;
	error = 0;
	if (shp->shm_segsz == attr->ia_size)
		goto size_out;
	/* Now we set them to the real values */
	old_dir = shp->shm_dir;
	old_pages = shp->shm_npages;
	if (old_dir){
		pte_t *swap;
		int i,j;
		i = old_pages < new_pages ? old_pages : new_pages;
		j = i % PTES_PER_PAGE;
		i /= PTES_PER_PAGE;
		if (j)
			memcpy (new_dir[i], old_dir[i], j * sizeof (pte_t));
		while (i--) {
			swap = new_dir[i];
			new_dir[i] = old_dir[i];
			old_dir[i] = swap;
		}
	}
	shp->shm_dir = new_dir;
	shp->shm_npages = new_pages;
	shp->shm_segsz = attr->ia_size;
size_out:
	shm_unlock(inode->i_ino);
	shm_free (old_dir, old_pages, 1);

set_attr:
	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	if (attr->ia_valid & ATTR_MODE)
		shp->shm_perm.mode = attr->ia_mode;
	if (attr->ia_valid & ATTR_UID)
		shp->shm_perm.uid = attr->ia_uid;
	if (attr->ia_valid & ATTR_GID)
		shp->shm_perm.gid = attr->ia_gid;
	shm_unlock (inode->i_ino);

	inode_setattr(inode, attr);
	return error;
}

static struct shmid_kernel *seg_alloc(int numpages, size_t namelen)
{
	struct shmid_kernel *shp;
	pte_t **dir;

	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp) + namelen, GFP_KERNEL);
	if (!shp)
		return ERR_PTR(-ENOMEM);

	dir = shm_alloc (numpages, namelen);
	if (IS_ERR(dir)) {
		kfree(shp);
		return ERR_PTR(PTR_ERR(dir));
	}
	shp->shm_dir = dir;
	shp->shm_npages = numpages;
	shp->shm_nattch = 0;
	shp->shm_namelen = namelen;
	return(shp);
}

static void seg_free(struct shmid_kernel *shp, int doacc)
{
	shm_free (shp->shm_dir, shp->shm_npages, doacc);
	kfree(shp);
}

static int newseg (key_t key, const char *name, int namelen,
		   int shmflg, size_t size)
{
	struct shmid_kernel *shp;
	int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
	int id;

	if (namelen > SHM_NAME_LEN)
		return -ENAMETOOLONG;

	if (size > shm_ctlmax)
		return -EINVAL;

	if (shm_tot + numpages >= shm_ctlall)
		return -ENOSPC;

	shp = seg_alloc(numpages, namelen ? namelen : SHM_FMT_LEN + 1);
	if (IS_ERR(shp))
		return PTR_ERR(shp);
	id = shm_addid(shp);
	if(id == -1) {
		seg_free(shp, 1);
		return -ENOSPC;
	}
	shp->shm_perm.key = key;
	shp->shm_flags = (shmflg & S_IRWXUGO);
	shp->shm_segsz = size;
	shp->shm_cprid = current->pid;
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = CURRENT_TIME;
	shp->id = shm_buildid(id,shp->shm_perm.seq);
	if (namelen != 0) {
		shp->shm_namelen = namelen;
		memcpy (shp->shm_name, name, namelen);
	} else {
		shp->shm_flags |= SHM_SYSV;
		shp->shm_namelen = sprintf (shp->shm_name, SHM_FMT, shp->id);
	}
	shm_unlock(id);

	return shp->id;
}

asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
	struct shmid_kernel *shp;
	int err, id = 0;

	if (size < SHMMIN)
		return -EINVAL;

	down(&shm_ids.sem);
	if (key == IPC_PRIVATE) {
		err = newseg(key, NULL, 0, shmflg, size);
	} else if ((id = ipc_findkey(&shm_ids,key)) == -1) {
		if (!(shmflg & IPC_CREAT))
			err = -ENOENT;
		else
			err = newseg(key, NULL, 0, shmflg, size);
	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
		err = -EEXIST;
	} else {
		shp = shm_lock(id);
		if(shp==NULL)
			BUG();
		if (shp->shm_segsz < size)
			err = -EINVAL;
		else if (ipcperms(&shp->shm_perm, shmflg))
			err = -EACCES;
		else
			err = shm_buildid(id, shp->shm_perm.seq);
		shm_unlock(id);
	}
	up(&shm_ids.sem);
	return err;
}
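
/*
 * For reference, a typical user-space call ending up here is e.g.
 *
 *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
 *
 * which creates a fresh one-page segment via newseg(); looking up an
 * existing key only succeeds if the requested size fits and
 * ipcperms() agrees.
 */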

/* FIXME: maybe we need lock_kernel() here */
static void shm_delete (struct inode *ino)
{
	int shmid = ino->i_ino;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	shp = shm_lock(shmid);
	if(shp==NULL) {
		BUG();
	}
	shp = shm_rmid(shmid);
	shm_unlock(shmid);
	up(&shm_ids.sem);
	seg_free(shp, 1);
	clear_inode(ino);
}

static inline unsigned long copy_shmid_to_user(void *buf, struct shmid64_ds *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shmid_ds out;

		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

struct shm_setbuf {
	uid_t	uid;
	gid_t	gid;
	mode_t	mode;
};

static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void *buf, int version)
{
	switch(version) {
	case IPC_64:
	    {
		struct shmid64_ds tbuf;

		if (copy_from_user(&tbuf, buf, sizeof(tbuf)))
			return -EFAULT;

		out->uid	= tbuf.shm_perm.uid;
		out->gid	= tbuf.shm_perm.gid;
		out->mode	= tbuf.shm_flags;

		return 0;
	    }
	case IPC_OLD:
	    {
		struct shmid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->uid	= tbuf_old.shm_perm.uid;
		out->gid	= tbuf_old.shm_perm.gid;
		out->mode	= tbuf_old.shm_flags;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}

static inline unsigned long copy_shminfo_to_user(void *buf, struct shminfo64 *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shminfo out;

		if(in->shmmax > INT_MAX)
			out.shmmax = INT_MAX;
		else
			out.shmmax = (int)in->shmmax;

		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}

asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
{
	struct shm_setbuf setbuf;
	struct shmid_kernel *shp;
	int err, version;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);

	switch (cmd) { /* replace with proc interface ? */
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo,0,sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = shm_ctlmni;
		shminfo.shmmax = shm_ctlmax;
		shminfo.shmall = shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if(copy_shminfo_to_user (buf, &shminfo, version))
			return -EFAULT;
		/* reading an integer is always atomic */
		err = shm_ids.max_id;
		if(err<0)
			err = 0;
		return err;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info,0,sizeof(shm_info));
		shm_lockall();
		shm_info.used_ids = shm_ids.in_use;
		shm_info.shm_rss = shm_rss;
		shm_info.shm_tot = shm_tot;
		shm_info.shm_swp = shm_swp;
		shm_info.swap_attempts = swap_attempts;
		shm_info.swap_successes = swap_successes;
		err = shm_ids.max_id;
		shm_unlockall();
		if(copy_to_user (buf, &shm_info, sizeof(shm_info)))
			return -EFAULT;

		return err < 0 ? 0 : err;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		memset(&tbuf, 0, sizeof(tbuf));
		shp = shm_lock(shmid);
		if(shp==NULL)
			return -EINVAL;
		if(cmd==SHM_STAT) {
			err = -EINVAL;
			if (!(shp->shm_flags & SHM_SYSV) ||
			    shmid > shm_ids.max_id)
				goto out_unlock;
			result = shm_buildid(shmid, shp->shm_perm.seq);
		} else {
			err = shm_checkid(shp,shmid);
			if(err)
				goto out_unlock;
			result = 0;
		}
		err=-EACCES;
		if (ipcperms (&shp->shm_perm, S_IRUGO))
			goto out_unlock;
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		/* ugly hack to keep binary compatibility for ipcs */
		tbuf.shm_flags &= PRV_DEST | PRV_LOCKED | S_IRWXUGO;
		if (tbuf.shm_flags & PRV_DEST)
			tbuf.shm_flags |= SHM_DEST;
		if (tbuf.shm_flags & PRV_LOCKED)
			tbuf.shm_flags |= SHM_LOCKED;
		tbuf.shm_flags &= SHM_DEST | SHM_LOCKED | S_IRWXUGO;
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		shm_unlock(shmid);
		if(copy_shmid_to_user (buf, &tbuf, version))
			return -EFAULT;
		return result;
	}
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		/* Allow superuser to lock segment in memory */
		/* Should the pages be faulted in here or leave it to user? */
		/* need to determine interaction with current->swappable */
		if ((shmid % SEQ_MULTIPLIER)== zero_id)
			return -EINVAL;
		if (!capable(CAP_IPC_LOCK))
			return -EPERM;

		shp = shm_lock(shmid);
		if(shp==NULL)
			return -EINVAL;
		err = shm_checkid(shp,shmid);
		if(err)
			goto out_unlock;
		if(cmd==SHM_LOCK)
			shp->shm_flags |= PRV_LOCKED;
		else
			shp->shm_flags &= ~PRV_LOCKED;
		shm_unlock(shmid);
		return err;
	}
	case IPC_RMID:
	{
		/*
		 * We cannot simply remove the file. The SVID states
		 * that the block remains until the last person
		 * detaches from it, then is deleted. A shmat() on
		 * an RMID segment is legal in older Linux and if
		 * we change it apps break...
		 *
		 * Instead we set a destroyed flag, and then blow
		 * the name away when the usage hits zero.
		 */
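		/*
		 * E.g. shmctl(id, IPC_RMID, NULL) on a still attached
		 * segment only sets PRV_DEST below; the pages are
		 * finally released from shm_close() when shm_nattch
		 * drops to zero.
		 */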
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		if (shp == NULL) {
			up(&shm_ids.sem);
			return -EINVAL;
		}
		err = shm_checkid(shp, shmid);
		if (err == 0) {
			if (shp->shm_nattch == 0 &&
			    !(shp->shm_flags & SHM_UNLK)) {
				int id=shp->id;
				shm_unlock(shmid);
				up(&shm_ids.sem);
				/*
				 * We can't hold shm_lock here else we
				 * will deadlock in shm_lookup when we
				 * try to recursively grab it.
				 */
				return shm_remove_name(id);
			}
			shp->shm_flags |= PRV_DEST;
			/* Do not find it any more */
			shp->shm_perm.key = IPC_PRIVATE;
		}
		/* Unlock */
		shm_unlock(shmid);
		up(&shm_ids.sem);
		return err;
	}
	case IPC_SET:
	{
		struct dentry * dentry;
		char name[SHM_FMT_LEN+1];

		if ((shmid % SEQ_MULTIPLIER)== zero_id)
			return -EINVAL;

		if(copy_shmid_from_user (&setbuf, buf, version))
			return -EFAULT;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		err=-EINVAL;
		if(shp==NULL)
			goto out_up;
		err = shm_checkid(shp,shmid);
		if(err)
			goto out_unlock_up;
		err=-EPERM;
		if (current->euid != shp->shm_perm.uid &&
		    current->euid != shp->shm_perm.cuid &&
		    !capable(CAP_SYS_ADMIN)) {
			goto out_unlock_up;
		}

		shp->shm_perm.uid = setbuf.uid;
		shp->shm_perm.gid = setbuf.gid;
		shp->shm_flags = (shp->shm_flags & ~S_IRWXUGO)
			| (setbuf.mode & S_IRWXUGO);
		shp->shm_ctim = CURRENT_TIME;
		shm_unlock(shmid);
		up(&shm_ids.sem);

		sprintf (name, SHM_FMT, shmid);
		dentry = lookup_one(name, lock_parent(shm_sb->s_root));
		unlock_dir(shm_sb->s_root);
		err = PTR_ERR(dentry);
		if (IS_ERR(dentry))
			goto bad_dentry;
		err = -ENOENT;
		if (dentry->d_inode) {
			struct inode *ino = dentry->d_inode;
			ino->i_uid = setbuf.uid;
			ino->i_gid = setbuf.gid;
			ino->i_mode = (setbuf.mode & S_IRWXUGO) | (ino->i_mode & ~S_IALLUGO);
			ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
			err = 0;
		}
		dput (dentry);
	bad_dentry:
		return err;
	}
	default:
		return -EINVAL;
	}

	err = 0;
out_unlock_up:
	shm_unlock(shmid);
out_up:
	up(&shm_ids.sem);
	return err;
out_unlock:
	shm_unlock(shmid);
	return err;
}

static inline void shm_inc (int id) {
	struct shmid_kernel *shp;

	if(!(shp = shm_lock(id)))
		BUG();
	shp->shm_atim = CURRENT_TIME;
	shp->shm_lprid = current->pid;
	shp->shm_nattch++;
	shm_unlock(id);
}

static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
		return -EINVAL;	/* we cannot do private writable mappings */
	UPDATE_ATIME(file->f_dentry->d_inode);
	vma->vm_ops = &shm_vm_ops;
	shm_inc(file->f_dentry->d_inode->i_ino);
	return 0;
}

/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 */
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
{
	struct shmid_kernel *shp;
	unsigned long addr;
	struct file * file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	unsigned long o_flags;
	int acc_mode;
	struct dentry *dentry;
	char   name[SHM_FMT_LEN+1];

	if (!shm_sb || (shmid % SEQ_MULTIPLIER) == zero_id)
		return -EINVAL;

	if ((addr = (ulong)shmaddr)) {
		if (addr & (SHMLBA-1)) {
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);	/* round down */
			else
				return -EINVAL;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else
		flags = MAP_SHARED;

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		o_flags = O_RDONLY;
		acc_mode = S_IRUGO;
	} else {
		prot = PROT_READ | PROT_WRITE;
		o_flags = O_RDWR;
		acc_mode = S_IRUGO | S_IWUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	shp = shm_lock(shmid);
	if(shp==NULL)
		return -EINVAL;
	err = ipcperms(&shp->shm_perm, acc_mode);
	shm_unlock(shmid);
	if (err)
		return -EACCES;

	sprintf (name, SHM_FMT, shmid);

	mntget(shm_fs_type.kern_mnt);
	dentry = lookup_one(name, lock_parent(shm_sb->s_root));
	unlock_dir(shm_sb->s_root);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto bad_file;
	err = -ENOENT;
	if (!dentry->d_inode)
		goto bad_file;
	file = dentry_open(dentry, shm_fs_type.kern_mnt, o_flags);
	err = PTR_ERR(file);
	if (IS_ERR (file))
		goto bad_file1;
	down(&current->mm->mmap_sem);
	*raddr = do_mmap (file, addr, file->f_dentry->d_inode->i_size,
			  prot, flags, 0);
	up(&current->mm->mmap_sem);
	if (IS_ERR(*raddr))
		err = PTR_ERR(*raddr);
	else
		err = 0;
	fput (file);
	return err;

bad_file1:
	dput(dentry);
bad_file:
	mntput(shm_fs_type.kern_mnt);
	if (err == -ENOENT)
		return -EINVAL;
	return err;
}
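
/*
 * For reference, user space reaches sys_shmat() through the shmat()
 * wrapper, e.g.
 *
 *	char *p = shmat(id, NULL, 0);
 *
 * note that this entry point returns the mapped address through *raddr
 * rather than as the return value.
 */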

/* This is called by fork, once for every shm attach. */
static void shm_open (struct vm_area_struct *shmd)
{
	shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino);
}

/*
 * Remove a name.
 */
static int shm_remove_name(int id)
{
	struct dentry *dir;
	struct dentry *dentry;
	int error;
	char name[SHM_FMT_LEN+1];

	sprintf (name, SHM_FMT, id);
	dir = lock_parent(shm_sb->s_root);
	dentry = lookup_one(name, dir);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/*
		 * We have to do our own unlink to prevent the vfs
		 * permission check. The SYSV IPC layer has already
		 * checked the permissions, which do not comply with
		 * the vfs rules.
		 */
		struct inode *inode = dir->d_inode;
		down(&inode->i_zombie);
		error = shm_unlink(inode, dentry);
		if (!error)
			d_delete(dentry);
		up(&inode->i_zombie);
		dput(dentry);
	}
	unlock_dir(dir);
	return error;
}

/*
 * remove the attach descriptor shmd.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close (struct vm_area_struct *shmd)
{
	int id = shmd->vm_file->f_dentry->d_inode->i_ino;
	struct shmid_kernel *shp;

	/* remove from the list of attaches of the shm segment */
	if(!(shp = shm_lock(id)))
		BUG();
	shp->shm_lprid = current->pid;
	shp->shm_dtim = CURRENT_TIME;
	shp->shm_nattch--;
	if(shp->shm_nattch == 0 &&
	   shp->shm_flags & PRV_DEST &&
	   !(shp->shm_flags & SHM_UNLK)) {
		int pid=shp->id;
		int err;
		shm_unlock(id);

		/* The kernel lock prevents new attaches from
		 * happening. We can't hold shm_lock here
		 * else we will deadlock in shm_lookup when we
		 * try to recursively grab it.
		 */
		err = shm_remove_name(pid);
		if(err && err != -EINVAL && err != -ENOENT)
			printk(KERN_ERR "Unlink of SHM id %d failed (%d).\n", pid, err);
	} else {
		shm_unlock(id);
	}
}

/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
asmlinkage long sys_shmdt (char *shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *shmd, *shmdnext;

	down(&mm->mmap_sem);
	for (shmd = mm->mmap; shmd; shmd = shmdnext) {
		shmdnext = shmd->vm_next;
		if (shmd->vm_ops == &shm_vm_ops
		    && shmd->vm_start - (shmd->vm_pgoff << PAGE_SHIFT) == (ulong) shmaddr)
			do_munmap(mm, shmd->vm_start, shmd->vm_end - shmd->vm_start);
	}
	up(&mm->mmap_sem);
	return 0;
}

/*
 * Enter the shm page into the SHM data structures.
 *
 * The way "nopage" is done, we don't actually have to
 * do anything here: nopage will have filled in the shm
 * data structures already, and shm_swap_out() will just
 * work off them..
 */
static int shm_swapout(struct page * page, struct file *file)
{
	return 0;
}

/*
 * page not present ... go through shm_dir
 */
static struct page * shm_nopage_core(struct shmid_kernel *shp, unsigned int idx, int *swp, int *rss, unsigned long address)
{
	pte_t pte;
	struct page * page;

	if (idx >= shp->shm_npages)
		return NOPAGE_SIGBUS;

	pte = SHM_ENTRY(shp,idx);
	if (!pte_present(pte)) {
		/* page not present so shm_swap can't race with us
		   and the semaphore protects us from other tasks that
		   could potentially fault on our pte under us */
		if (pte_none(pte)) {
			shm_unlock(shp->id);
			page = page_cache_alloc();
			if (!page)
				goto oom;
			clear_user_highpage(page, address);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
		} else {
			swp_entry_t entry = pte_to_swp_entry(pte);

			shm_unlock(shp->id);
			page = lookup_swap_cache(entry);
			if (!page) {
				lock_kernel();
				swapin_readahead(entry);
				page = read_swap_cache(entry);
				unlock_kernel();
				if (!page)
					goto oom;
			}
			delete_from_swap_cache(page);
			page = replace_with_highmem(page);
			swap_free(entry);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
			(*swp)--;
		}
		(*rss)++;
		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
		SHM_ENTRY(shp, idx) = pte;
	}

	/* pte_val(pte) == SHM_ENTRY (shp, idx) */
	page_cache_get(pte_page(pte));
	return pte_page(pte);

oom:
	shm_lock(shp->id);
	return NOPAGE_OOM;
}

static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page * page;
	struct shmid_kernel *shp;
	unsigned int idx;
	struct inode * inode = shmd->vm_file->f_dentry->d_inode;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	down(&inode->i_sem);
	if(!(shp = shm_lock(inode->i_ino)))
		BUG();
	page = shm_nopage_core(shp, idx, &shm_swp, &shm_rss, address);
	shm_unlock(inode->i_ino);
	up(&inode->i_sem);
	return(page);
}

#define OKAY	0
#define RETRY	1
#define FAILED	2

static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage)
{
	pte_t page;
	struct page *page_map;

	page = SHM_ENTRY(shp, idx);
	if (!pte_present(page))
		return RETRY;
	page_map = pte_page(page);
	if (page_map->zone->free_pages > page_map->zone->pages_high)
		return RETRY;
	if (shp->id != zero_id) swap_attempts++;

	if (--*counter < 0) /* failed */
		return FAILED;
	if (page_count(page_map) != 1)
		return RETRY;

	lock_page(page_map);
	if (!(page_map = prepare_highmem_swapout(page_map)))
		return FAILED;
	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);

	/* add the locked page to the swap cache before allowing
	   the swapin path to run lookup_swap_cache(). This avoids
	   reading a not yet uptodate block from disk.
	   NOTE: we just accounted the swap space reference for this
	   swap cache page at __get_swap_page() time. */
	add_to_swap_cache(*outpage = page_map, swap_entry);
	return OKAY;
}

static void shm_swap_postop(struct page *page)
{
	lock_kernel();
	rw_swap_page(WRITE, page, 0);
	unlock_kernel();
	page_cache_release(page);
}

static int shm_swap_preop(swp_entry_t *swap_entry)
{
	lock_kernel();
	/* subtle: preload the swap count for the swap cache. We can't
	   increase the count inside the critical section as we can't release
	   the shm_lock there. And we can't acquire the big lock with the
	   shm_lock held (otherwise we would deadlock too easily). */
	*swap_entry = __get_swap_page(2);
	if (!(*swap_entry).val) {
		unlock_kernel();
		return 1;
	}
	unlock_kernel();
	return 0;
}

/*
 * Goes through counter = (shm_rss / (prio + 1)) present shm pages.
 */
static unsigned long swap_id;  /* currently being swapped */
static unsigned long swap_idx; /* next to swap */

int shm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long id, idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	zshm_swap(prio, gfp_mask);
	counter = shm_rss / (prio + 1);
	if (!counter)
		return 0;
	if (shm_swap_preop(&swap_entry))
		return 0;

	shm_lockall();
check_id:
	shp = shm_get(swap_id);
	if(shp==NULL || shp->shm_flags & PRV_LOCKED) {
next_id:
		swap_idx = 0;
		if (++swap_id > shm_ids.max_id) {
			swap_id = 0;
			if (loop) {
failed:
				shm_unlockall();
				__swap_free(swap_entry, 2);
				return 0;
			}
			loop = 1;
		}
		goto check_id;
	}
	id = swap_id;

check_table:
	idx = swap_idx++;
	if (idx >= shp->shm_npages)
		goto next_id;

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	swap_successes++;
	shm_swp++;
	shm_rss--;
	shm_unlockall();

	shm_swap_postop(page_map);
	return 1;
}

/*
 * Free the swap entry and set the new pte for the shm page.
 */
static void shm_unuse_page(struct shmid_kernel *shp, unsigned long idx,
			   swp_entry_t entry, struct page *page)
{
	pte_t pte;

	pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
	SHM_ENTRY(shp, idx) = pte;
	page_cache_get(page);
	shm_rss++;

	shm_swp--;

	swap_free(entry);
}

static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
{
	int n;

	for (n = 0; n < shp->shm_npages; n++) {
		if (pte_none(SHM_ENTRY(shp,n)))
			continue;
		if (pte_present(SHM_ENTRY(shp,n)))
			continue;
		if (pte_to_swp_entry(SHM_ENTRY(shp,n)).val == entry.val) {
			shm_unuse_page(shp, n, entry, page);
			return 1;
		}
	}
	return 0;
}

/*
 * shm_unuse() searches for a possibly swapped-out shm page.
 */
void shm_unuse(swp_entry_t entry, struct page *page)
{
	int i;

	shm_lockall();
	for (i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel *shp = shm_get(i);
		if(shp==NULL)
			continue;
		if (shm_unuse_core(shp, entry, page))
			goto out;
	}
out:
	shm_unlockall();
	zmap_unuse(entry, page);
}

#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
{
	off_t pos = 0;
	off_t begin = 0;
	int i, len = 0;

	down(&shm_ids.sem);
	len += sprintf(buffer, "       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime name\n");

	for(i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel* shp;

		if (i == zero_id)
			continue;
		shp = shm_lock(i);
		if(shp!=NULL) {
#define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
#define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
			char *format;

			if (sizeof(size_t) <= sizeof(int))
				format = SMALL_STRING;
			else
				format = BIG_STRING;
			len += sprintf(buffer + len, format,
				shp->shm_perm.key,
				shm_buildid(i, shp->shm_perm.seq),
				shp->shm_flags,
				shp->shm_segsz,
				shp->shm_cprid,
				shp->shm_lprid,
				shp->shm_nattch,
				shp->shm_perm.uid,
				shp->shm_perm.gid,
				shp->shm_perm.cuid,
				shp->shm_perm.cgid,
				shp->shm_atim,
				shp->shm_dtim,
				shp->shm_ctim,
				shp->shm_namelen,
				shp->shm_name,
				shp->shm_flags & SHM_UNLK ? " (deleted)" : "");
			shm_unlock(i);
		}
		pos += len;
		if(pos < offset) {
			len = 0;
			begin = pos;
		}
		if(pos > offset + length)
			goto done;
	}
	*eof = 1;
done:
	up(&shm_ids.sem);
	*start = buffer + (offset - begin);
	len -= (offset - begin);
	if(len > length)
		len = length;
	if(len < 0)
		len = 0;
	return len;
}
#endif

#define VMA_TO_SHP(vma)	((vma)->vm_file->private_data)

static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
static unsigned long zswap_idx; /* next to swap */
static struct shmid_kernel *zswap_shp = &zshmid_kernel;
static int zshm_rss;

static struct vm_operations_struct shmzero_vm_ops = {
	open:		shmzero_open,
	close:		shmzero_close,
	nopage:		shmzero_nopage,
	swapout:	shm_swapout,
};

/*
 * In this implementation, the "unuse" and "swapout" interfaces are
 * interlocked via the kernel_lock, as well as shm_lock(zero_id).
 * "unuse" and "nopage/swapin", as well as "swapout" and "nopage/swapin"
 * interlock via shm_lock(zero_id). All these interlocks could be based
 * on a per-mapping lock instead of being a global lock.
 *
 * Reference (existence) counting on the file/dentry/inode is done
 * by generic vm_file code. The zero code does not hold any reference
 * on the pseudo-file. This is possible because the open/close calls
 * are bracketed by the file count update calls.
 */
static struct file *file_setup(struct file *fzero, struct shmid_kernel *shp)
{
	struct file *filp;
	struct inode *inp;

	if ((filp = get_empty_filp()) == 0)
		return(filp);
	if ((inp = get_empty_inode()) == 0) {
		put_filp(filp);
		return(0);
	}
	if ((filp->f_dentry = d_alloc(zdent, &(const struct qstr) { "dev/zero",
						8, 0 })) == 0) {
		iput(inp);
		put_filp(filp);
		return(0);
	}
	filp->f_vfsmnt = mntget(shm_fs_type.kern_mnt);
	d_instantiate(filp->f_dentry, inp);

	/*
	 * Copy over dev/ino for benefit of procfs. Use
	 * ino to indicate separate mappings.
	 */
	filp->f_dentry->d_inode->i_dev = shm_fs_type.kern_mnt->mnt_sb->s_dev;
	filp->f_dentry->d_inode->i_ino = (unsigned long)shp;
	if (fzero)
		fput(fzero);	/* release /dev/zero file */
	return(filp);
}

int map_zero_setup(struct vm_area_struct *vma)
{
	extern int vm_enough_memory(long pages);
	struct shmid_kernel *shp;
	struct file *filp;

	if (!vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))
		return -ENOMEM;
	if (IS_ERR(shp = seg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE, 0)))
		return PTR_ERR(shp);
	if ((filp = file_setup(vma->vm_file, shp)) == 0) {
		seg_free(shp, 0);
		return -ENOMEM;
	}
	vma->vm_file = filp;
	VMA_TO_SHP(vma) = (void *)shp;
	shp->id = zero_id;
	init_MUTEX(&shp->zsem);
	vma->vm_ops = &shmzero_vm_ops;
	shmzero_open(vma);
	spin_lock(&zmap_list_lock);
	list_add(&shp->zero_list, &zshmid_kernel.zero_list);
	spin_unlock(&zmap_list_lock);
	return 0;
}
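
/*
 * map_zero_setup() is reached from the /dev/zero driver's mmap when
 * user space creates a shared mapping of /dev/zero, e.g.
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * with fd open on /dev/zero; each such mapping gets its own unaccounted
 * segment and pseudo-file as set up above.
 */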

static void shmzero_open(struct vm_area_struct *shmd)
{
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shp->shm_nattch++;
	up(&shp->zsem);
}

static void shmzero_close(struct vm_area_struct *shmd)
{
	int done = 0;
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	if (--shp->shm_nattch == 0)
		done = 1;
	up(&shp->zsem);
	if (done) {
		spin_lock(&zmap_list_lock);
		if (shp == zswap_shp)
			zswap_shp = list_entry(zswap_shp->zero_list.next,
						struct shmid_kernel, zero_list);
		list_del(&shp->zero_list);
		spin_unlock(&zmap_list_lock);
		seg_free(shp, 0);
	}
}

static struct page * shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page *page;
	struct shmid_kernel *shp;
	unsigned int idx;
	int dummy;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shm_lock(zero_id);
	page = shm_nopage_core(shp, idx, &dummy, &zshm_rss, address);
	shm_unlock(zero_id);
	up(&shp->zsem);
	return(page);
}

static void zmap_unuse(swp_entry_t entry, struct page *page)
{
	struct shmid_kernel *shp;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	for (shp = list_entry(zshmid_kernel.zero_list.next, struct shmid_kernel,
			zero_list); shp != &zshmid_kernel;
			shp = list_entry(shp->zero_list.next, struct shmid_kernel,
			zero_list)) {
		if (shm_unuse_core(shp, entry, page))
			break;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);
}

static void zshm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	counter = zshm_rss / (prio + 1);
	if (!counter)
		return;
next:
	if (shm_swap_preop(&swap_entry))
		return;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	if (zshmid_kernel.zero_list.next == 0)
		goto failed;
next_id:
	if (zswap_shp == &zshmid_kernel) {
		if (loop) {
failed:
			shm_unlock(zero_id);
			spin_unlock(&zmap_list_lock);
			__swap_free(swap_entry, 2);
			return;
		}
		zswap_shp = list_entry(zshmid_kernel.zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		loop = 1;
	}
	shp = zswap_shp;

check_table:
	idx = zswap_idx++;
	if (idx >= shp->shm_npages) {
		zswap_shp = list_entry(zswap_shp->zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		goto next_id;
	}

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);

	shm_swap_postop(page_map);
	if (counter)
		goto next;
	return;
}